diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0829a7b --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +*.rknn +*.pt +*.torchscript +*.onnx +*.log +*.pyc +*.zip +*.npy +*.pkl +*.pb + +check*.onnx + +# ignore the following folder name +build +install + +datasets/COCO/annotations/* +datasets/COCO/val2017/* + +breakpoint_output/ + +models/CV/body_analysis/ +models/CV/classification/ +models/CV/image_segmentation/ +tmp/ + +dump_data_distribute/ +snapshot/ +snapshot*/ + +# ignore file +models/CV/object_detection/yolo/RKNN_python_demo/*.json +capi_tools/internal/fixed_frequency_android.sh +capi_tools/internal/fixed_frequency.sh +common/onnx_breakpoint_tools.py +report.yml + +developer_conference/ +internal_data/ +.vscode/ +model_cvt +datasets/COCO/try_load.py +common/utils/run_and_gen_result.py +common/utils/update_driver.py +result/ +*DS_Store* + + +# for unittest +unittest/example_test_result +unittest/env_config.py diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 1785b85..3990558 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -25,9 +25,10 @@ set(LIBJPEG ${JPEG_PATH}/${CMAKE_SYSTEM_NAME}/${TARGET_LIB_ARCH}/libturbojpeg.a set(LIBJPEG_INCLUDES ${JPEG_PATH}/include PARENT_SCOPE) # rknn runtime -if (TARGET_SOC STREQUAL "rk3588" OR TARGET_SOC STREQUAL "rk356x" OR TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") +# for rknpu2 +if (TARGET_SOC STREQUAL "rk3588" OR TARGET_SOC STREQUAL "rk3576" OR TARGET_SOC STREQUAL "rk356x" OR TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") set(RKNN_PATH ${CMAKE_CURRENT_SOURCE_DIR}/rknpu2) - if (TARGET_SOC STREQUAL "rk3588" OR TARGET_SOC STREQUAL "rk356x") + if (TARGET_SOC STREQUAL "rk3588" OR TARGET_SOC STREQUAL "rk356x" OR TARGET_SOC STREQUAL "rk3576" ) set(LIBRKNNRT ${RKNN_PATH}/${CMAKE_SYSTEM_NAME}/${TARGET_LIB_ARCH}/librknnrt.so) endif() if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") @@ -35,6 +36,15 @@ if (TARGET_SOC STREQUAL "rk3588" OR TARGET_SOC STREQUAL "rk356x" OR TARGET_SOC S endif() set(LIBRKNNRT_INCLUDES ${RKNN_PATH}/include PARENT_SCOPE) endif() + +# for rknpu1 +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(RKNN_PATH ${CMAKE_CURRENT_SOURCE_DIR}/rknpu1) + + set(LIBRKNNRT ${RKNN_PATH}/${CMAKE_SYSTEM_NAME}/${TARGET_LIB_ARCH}/librknn_api.so) + + set(LIBRKNNRT_INCLUDES ${RKNN_PATH}/include PARENT_SCOPE) +endif() install(PROGRAMS ${LIBRKNNRT} DESTINATION lib) set(LIBRKNNRT ${LIBRKNNRT} PARENT_SCOPE) diff --git a/3rdparty/allocator/dma/dma_alloc.hpp b/3rdparty/allocator/dma/dma_alloc.hpp new file mode 100644 index 0000000..53cc7e8 --- /dev/null +++ b/3rdparty/allocator/dma/dma_alloc.hpp @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2022 Rockchip Electronics Co., Ltd. + * Authors: + * Cerf Yu + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+#include "dma_alloc.h"
+#include "RgaUtils.h"
+
+typedef unsigned long long __u64;
+typedef unsigned int __u32;
+
+struct dma_heap_allocation_data {
+    __u64 len;
+    __u32 fd;
+    __u32 fd_flags;
+    __u64 heap_flags;
+};
+
+struct dma_heap_import_data {
+    __u64 len;
+    __u32 fd;
+    __u32 fd_flags;
+    __u64 heap_flags;
+};
+
+#define DMA_HEAP_IOC_MAGIC 'H'
+#define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
+                                   struct dma_heap_allocation_data)
+#define DMA_HEAP_IOCTL_IMPORT _IOWR(DMA_HEAP_IOC_MAGIC, 0x1,\
+                                    struct dma_heap_import_data)
+
+#define DMA_BUF_SYNC_READ (1 << 0)
+#define DMA_BUF_SYNC_WRITE (2 << 0)
+#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE)
+#define DMA_BUF_SYNC_START (0 << 2)
+#define DMA_BUF_SYNC_END (1 << 2)
+
+struct dma_buf_sync {
+    __u64 flags;
+};
+
+#define DMA_BUF_BASE 'b'
+#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
+
+#define CMA_HEAP_SIZE 1024 * 1024
+
+int dma_sync_device_to_cpu(int fd) {
+    struct dma_buf_sync sync = {0};
+
+    sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW;
+    return ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync);
+}
+
+int dma_sync_cpu_to_device(int fd) {
+    struct dma_buf_sync sync = {0};
+
+    sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
+    return ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync);
+}
+
+int dma_buf_alloc(const char *path, size_t size, int *fd, void **va) {
+    int ret;
+    int prot;
+    void *mmap_va;
+    int dma_heap_fd = -1;
+    struct dma_heap_allocation_data buf_data;
+
+    /* open dma_heap fd */
+    if (dma_heap_fd < 0) {
+        dma_heap_fd = open(path, O_RDWR);
+        if (dma_heap_fd < 0) {
+            printf("open %s fail!\n", path);
+            return dma_heap_fd;
+        }
+    }
+
+    /* alloc buffer */
+    memset(&buf_data, 0x0, sizeof(struct dma_heap_allocation_data));
+
+    buf_data.len = size;
+    buf_data.fd_flags = O_CLOEXEC | O_RDWR;
+    ret = ioctl(dma_heap_fd, DMA_HEAP_IOCTL_ALLOC, &buf_data);
+    if (ret < 0) {
+        printf("RK_DMA_HEAP_ALLOC_BUFFER failed\n");
+        return ret;
+    }
+
+    /* mmap va */
+    if (fcntl(buf_data.fd, F_GETFL) & O_RDWR)
+        prot = PROT_READ | PROT_WRITE;
+    else
+        prot = PROT_READ;
+
+    /* mmap contiguous buffer to user */
+    mmap_va = (void *)mmap(NULL, buf_data.len, prot, MAP_SHARED, buf_data.fd, 0);
+    if (mmap_va == MAP_FAILED) {
+        printf("mmap failed: %s\n", strerror(errno));
+        return -errno;
+    }
+
+    *va = mmap_va;
+    *fd = buf_data.fd;
+
+    close(dma_heap_fd);
+
+    return 0;
+}
+
+void dma_buf_free(size_t size, int *fd, void *va) {
+    int len;
+
+    len = size;
+    munmap(va, len);
+
+    close(*fd);
+    *fd = -1;
+}
+
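For orientation, this is the order in which the helpers above are meant to be called. A minimal sketch, not part of the patch: the heap node `/dev/dma_heap/system` is an assumption (the node name varies with the kernel configuration), and a real caller should check every return value.

```c
/* Illustrative only: allocate a DMA buffer, touch it from the CPU, hand it off.
   The heap path below is an assumption; adjust it to the target kernel. */
#include <stdio.h>
#include <string.h>
#include "dma_alloc.hpp"

int dma_example(void) {
    int fd = -1;
    void *va = NULL;
    const size_t size = 4096;

    if (dma_buf_alloc("/dev/dma_heap/system", size, &fd, &va) != 0) {
        printf("dma_buf_alloc failed\n");
        return -1;
    }

    dma_sync_device_to_cpu(fd);   /* begin CPU access (DMA_BUF_SYNC_START) */
    memset(va, 0, size);          /* CPU writes through the cached mapping */
    dma_sync_cpu_to_device(fd);   /* end CPU access (DMA_BUF_SYNC_END), make writes visible */

    /* ... pass fd/va to the NPU or RGA here ... */

    dma_buf_free(size, &fd, va);
    return 0;
}
```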
diff --git a/3rdparty/allocator/drm/drm_alloc.hpp b/3rdparty/allocator/drm/drm_alloc.hpp
new file mode 100644
index 0000000..b387a4b
--- /dev/null
+++ b/3rdparty/allocator/drm/drm_alloc.hpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2022 Rockchip Electronics Co., Ltd.
+ * Authors:
+ *     Randall zhuo
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined (__arm__) || defined (__aarch64__)
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>  // open function
+#include <unistd.h> // close function
+#include <errno.h>
+#include <dlfcn.h>
+#include <sys/mman.h>
+
+#include "drm_fourcc.h"
+#include "xf86drm.h"
+
+typedef int(* DRM_IOCTL)(int fd, unsigned long request, void *arg);
+static DRM_IOCTL drmIoctl_func = NULL;
+static void *drm_handle = NULL;
+static int drm_fd = -1;
+
+struct drm_rockchip_gem_phys {
+    uint32_t handle;
+    uint32_t phy_addr;
+};
+
+#define DRM_ROCKCHIP_GEM_GET_PHYS 0x04
+#define DRM_IOCTL_ROCKCHIP_GEM_GET_PHYS DRM_IOWR(DRM_COMMAND_BASE + \
+        DRM_ROCKCHIP_GEM_GET_PHYS, struct drm_rockchip_gem_phys)
+
+
+static int drm_init()
+{
+    static const char *card = "/dev/dri/card0";
+    int flag = O_RDWR;
+    int drm_fd = -1;
+
+    drm_fd = open(card, flag);
+    if (drm_fd < 0)
+    {
+        printf("failed to open %s\n", card);
+        return -1;
+    }
+    return drm_fd;
+}
+
+static void drm_deinit(int drm_fd)
+{
+    if (drm_fd > 0)
+    {
+        close(drm_fd);
+    }
+}
+
+void *drm_buf_alloc(int TexWidth, int TexHeight, int bpp, int *fd, int *handle, size_t *actual_size, int flags=0)
+{
+    int ret;
+    char *map = NULL;
+
+    void *vir_addr = NULL;
+    struct drm_prime_handle fd_args;
+    struct drm_mode_map_dumb mmap_arg;
+    struct drm_mode_destroy_dumb destory_arg;
+
+    struct drm_mode_create_dumb alloc_arg;
+
+    if ((drm_fd < 0) || (drmIoctl_func == NULL)) {
+        return NULL;
+    }
+
+    memset(&alloc_arg, 0, sizeof(alloc_arg));
+    alloc_arg.bpp = bpp;
+    alloc_arg.width = TexWidth;
+    alloc_arg.height = TexHeight;
+    alloc_arg.flags = flags;
+
+    // get the handle and size
+    ret = drmIoctl_func(drm_fd, DRM_IOCTL_MODE_CREATE_DUMB, &alloc_arg);
+    if (ret)
+    {
+        printf("failed to create dumb buffer: %s\n", strerror(errno));
+        return NULL;
+    }
+    if (handle != NULL)
+    {
+        *handle = alloc_arg.handle;
+    }
+    if (actual_size != NULL)
+    {
+        *actual_size = alloc_arg.size;
+    }
+
+    int pagesize = sysconf(_SC_PAGESIZE);
+#if 1
+    printf("pagesize is %d\n", pagesize);
+    printf("create width=%u, height=%u, bpp=%u, size=%lu dumb buffer\n", alloc_arg.width, alloc_arg.height, alloc_arg.bpp, (unsigned long)alloc_arg.size);
+    printf("out handle= %d\n", alloc_arg.handle);
+#endif
+    // get the fd
+    memset(&fd_args, 0, sizeof(fd_args));
+    fd_args.fd = -1;
+    fd_args.handle = alloc_arg.handle;
+    fd_args.flags = 0;
+
+    ret = drmIoctl_func(drm_fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &fd_args);
+    if (ret)
+    {
+        printf("rk-debug handle_to_fd failed ret=%d,err=%s, handle=%x \n", ret, strerror(errno), fd_args.handle);
+        return NULL;
+    }
+
+    if (fd != NULL)
+    {
+        *fd = fd_args.fd;
+    }
+
+    // get the virtual address
+    memset(&mmap_arg, 0, sizeof(mmap_arg));
+    mmap_arg.handle = alloc_arg.handle;
+
+    ret = drmIoctl_func(drm_fd, DRM_IOCTL_MODE_MAP_DUMB, &mmap_arg);
+    if (ret)
+    {
+        printf("failed to create map dumb: %s\n", strerror(errno));
+        vir_addr = NULL;
+        goto destory_dumb;
+    }
+
+    vir_addr = map = (char *)mmap(0, alloc_arg.size, PROT_READ | PROT_WRITE, MAP_SHARED, drm_fd, mmap_arg.offset);
+    if (map == MAP_FAILED)
+    {
+        printf("failed to mmap buffer: %s\n", strerror(errno));
+        vir_addr = NULL;
+        goto destory_dumb;
+    }
+    return vir_addr;
+
+destory_dumb:
+    memset(&destory_arg, 0, sizeof(destory_arg));
+    destory_arg.handle = alloc_arg.handle;
+    ret = drmIoctl_func(drm_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destory_arg);
+    return vir_addr;
+}
+
+int drm_buf_destroy(int buf_fd, int handle, void *drm_buf, size_t size)
+{
+    int ret = -1;
+
+    if ((drm_fd < 0) || (drmIoctl_func ==
NULL)) { + return -1; + } + + if (drm_buf == NULL) + { + return -1; + } + + munmap(drm_buf, size); + + struct drm_mode_destroy_dumb destory_arg; + memset(&destory_arg, 0, sizeof(destory_arg)); + destory_arg.handle = handle; + + ret = drmIoctl_func(drm_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destory_arg); + if (ret) + { + printf("failed to destory dumb %d, error=%s\n", ret, strerror(errno)); + } + + if (buf_fd > 0) + { + close(buf_fd); + } + + return ret; +} + +uint32_t drm_buf_get_phy(int handle) { + struct drm_rockchip_gem_phys phys_arg; + phys_arg.handle = handle; + phys_arg.phy_addr = 0; + + int ret = drmIoctl_func(drm_fd, DRM_IOCTL_ROCKCHIP_GEM_GET_PHYS, &phys_arg); + if (ret) + printf("failed to get phy address: %s\n", strerror(errno)); + + printf("get phys 0x%x\n", phys_arg.phy_addr); + + return phys_arg.phy_addr; +} + +__attribute__((constructor)) static int load_drm() { + drm_fd = drm_init(); + + if (drm_fd < 0) { + return -1; + } + + drm_handle = dlopen("libdrm.so", RTLD_LAZY); + + if (!drm_handle) { + printf("[RKNN] Can not find libdrm.so\n"); + drm_deinit(drm_fd); + return -1; + } + + drmIoctl_func = (DRM_IOCTL)dlsym(drm_handle, "drmIoctl"); + + if (drmIoctl_func == NULL) { + dlclose(drm_handle); + drm_handle = NULL; + drm_deinit(drm_fd); + return -1; + } + + return 0; +} + +__attribute__((destructor)) static void unload_drm() { + if (drm_handle) { + dlclose(drm_handle); + drm_handle = NULL; + } + + drm_deinit(drm_fd); + drm_fd = -1; +} + +#if 0 +int main_(){ + void *drm_buf = NULL; + int drm_fd = -1; + int out_fd; + unsigned int handle; + int width = 224; + int height = 224; + int channel = 3; + int size = width*height*channel; + int actual_size=0; + // DRM alloc buffer + while(1){ + drm_fd = drm_init(); + + drm_buf = drm_buf_alloc(drm_fd,width,height,channel*8,&out_fd,&handle,&actual_size); + // unsigned char * buf = (unsigned char *) drm_buf; + // for(int i = 0;i + +/* + Definition of extended flag for rknn_init. +*/ +/* set high priority context. */ +#define RKNN_FLAG_PRIOR_HIGH 0x00000000 + +/* set medium priority context */ +#define RKNN_FLAG_PRIOR_MEDIUM 0x00000001 + +/* set low priority context. */ +#define RKNN_FLAG_PRIOR_LOW 0x00000002 + +/* asynchronous mode. + when enable, rknn_outputs_get will not block for too long because it directly retrieves the result of + the previous frame which can increase the frame rate on single-threaded mode, but at the cost of + rknn_outputs_get not retrieves the result of the current frame. + in multi-threaded mode you do not need to turn this mode on. */ +#define RKNN_FLAG_ASYNC_MASK 0x00000004 + +/* collect performance mode. + when enable, you can get detailed performance reports via rknn_query(ctx, RKNN_QUERY_PERF_DETAIL, ...), + but it will reduce the frame rate. */ +#define RKNN_FLAG_COLLECT_PERF_MASK 0x00000008 + +/* + save pre-compile model. +*/ +#define RKNN_FLAG_PRECOMPILE_MASK 0x00000020 + +/* + Error code returned by the RKNN API. +*/ +#define RKNN_SUCC 0 /* execute succeed. */ +#define RKNN_ERR_FAIL -1 /* execute failed. */ +#define RKNN_ERR_TIMEOUT -2 /* execute timeout. */ +#define RKNN_ERR_DEVICE_UNAVAILABLE -3 /* device is unavailable. */ +#define RKNN_ERR_MALLOC_FAIL -4 /* memory malloc fail. */ +#define RKNN_ERR_PARAM_INVALID -5 /* parameter is invalid. */ +#define RKNN_ERR_MODEL_INVALID -6 /* model is invalid. */ +#define RKNN_ERR_CTX_INVALID -7 /* context is invalid. */ +#define RKNN_ERR_INPUT_INVALID -8 /* input is invalid. */ +#define RKNN_ERR_OUTPUT_INVALID -9 /* output is invalid. 
*/ +#define RKNN_ERR_DEVICE_UNMATCH -10 /* the device is unmatch, please update rknn sdk + and npu driver/firmware. */ +#define RKNN_ERR_INCOMPATILE_PRE_COMPILE_MODEL -11 /* This RKNN model use pre_compile mode, but not compatible with current driver. */ +//add by chifred: for reporting optimization version bug info +#define RKNN_ERR_INCOMPATILE_OPTIMIZATION_LEVEL_VERSION -12 /* This RKNN model set optimization level, but not compatible with current driver. */ +#define RKNN_ERR_TARGET_PLATFORM_UNMATCH -13 /* This RKNN model set target platform, but not compatible with current platform. */ +//chifred add end +#define RKNN_ERR_NON_PRE_COMPILED_MODEL_ON_MINI_DRIVER -14 /* This RKNN model is not a pre-compiled model, but the npu driver is mini driver. */ + +/* + Definition for tensor +*/ +#define RKNN_MAX_DIMS 16 /* maximum dimension of tensor. */ +#define RKNN_MAX_NAME_LEN 256 /* maximum name lenth of tensor. */ + + + +#ifdef __arm__ +typedef uint32_t rknn_context; +#else +typedef uint64_t rknn_context; +#endif + +/* + The query command for rknn_query +*/ +typedef enum _rknn_query_cmd { + RKNN_QUERY_IN_OUT_NUM = 0, /* query the number of input & output tensor. */ + RKNN_QUERY_INPUT_ATTR, /* query the attribute of input tensor. */ + RKNN_QUERY_OUTPUT_ATTR, /* query the attribute of output tensor. */ + RKNN_QUERY_PERF_DETAIL, /* query the detail performance, need set + RKNN_FLAG_COLLECT_PERF_MASK when call rknn_init. */ + RKNN_QUERY_PERF_RUN, /* query the time of run. */ + RKNN_QUERY_SDK_VERSION, /* query the sdk & driver version */ + RKNN_QUERY_PRE_COMPILE, /* query the pre compile model */ + + RKNN_QUERY_CMD_MAX +} rknn_query_cmd; + +/* + the tensor data type. +*/ +typedef enum _rknn_tensor_type { + RKNN_TENSOR_FLOAT32 = 0, /* data type is float32. */ + RKNN_TENSOR_FLOAT16, /* data type is float16. */ + RKNN_TENSOR_INT8, /* data type is int8. */ + RKNN_TENSOR_UINT8, /* data type is uint8. */ + RKNN_TENSOR_INT16, /* data type is int16. */ + + RKNN_TENSOR_TYPE_MAX +} rknn_tensor_type; + +inline static const char *get_type_string(rknn_tensor_type type) +{ + switch (type) + { + case RKNN_TENSOR_FLOAT32: + return "FP32"; + case RKNN_TENSOR_FLOAT16: + return "FP16"; + case RKNN_TENSOR_INT8: + return "INT8"; + case RKNN_TENSOR_UINT8: + return "UINT8"; + case RKNN_TENSOR_INT16: + return "INT16"; + default: + return "UNKNOW"; + } +} + +/* + the quantitative type. +*/ +typedef enum _rknn_tensor_qnt_type { + RKNN_TENSOR_QNT_NONE = 0, /* none. */ + RKNN_TENSOR_QNT_DFP, /* dynamic fixed point. */ + RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC, /* asymmetric affine. */ + + RKNN_TENSOR_QNT_MAX +} rknn_tensor_qnt_type; + +inline static const char *get_qnt_type_string(rknn_tensor_qnt_type type) +{ + switch (type) + { + case RKNN_TENSOR_QNT_NONE: + return "NONE"; + case RKNN_TENSOR_QNT_DFP: + return "DFP"; + case RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC: + return "AFFINE"; + default: + return "UNKNOW"; + } +} + +/* + the tensor data format. +*/ +typedef enum _rknn_tensor_format { + RKNN_TENSOR_NCHW = 0, /* data format is NCHW. */ + RKNN_TENSOR_NHWC, /* data format is NHWC. */ + + RKNN_TENSOR_FORMAT_MAX +} rknn_tensor_format; + +inline static const char *get_format_string(rknn_tensor_format fmt) +{ + switch (fmt) + { + case RKNN_TENSOR_NCHW: + return "NCHW"; + case RKNN_TENSOR_NHWC: + return "NHWC"; + default: + return "UNKNOW"; + } +} + +/* + the information for RKNN_QUERY_IN_OUT_NUM. +*/ +typedef struct _rknn_input_output_num { + uint32_t n_input; /* the number of input. */ + uint32_t n_output; /* the number of output. 
*/ +} rknn_input_output_num; + +/* + the information for RKNN_QUERY_INPUT_ATTR / RKNN_QUERY_OUTPUT_ATTR. +*/ +typedef struct _rknn_tensor_attr { + uint32_t index; /* input parameter, the index of input/output tensor, + need set before call rknn_query. */ + + uint32_t n_dims; /* the number of dimensions. */ + uint32_t dims[RKNN_MAX_DIMS]; /* the dimensions array. */ + char name[RKNN_MAX_NAME_LEN]; /* the name of tensor. */ + + uint32_t n_elems; /* the number of elements. */ + uint32_t size; /* the bytes size of tensor. */ + + rknn_tensor_format fmt; /* the data format of tensor. */ + rknn_tensor_type type; /* the data type of tensor. */ + rknn_tensor_qnt_type qnt_type; /* the quantitative type of tensor. */ + int8_t fl; /* fractional length for RKNN_TENSOR_QNT_DFP. */ + uint32_t zp; /* zero point for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */ + float scale; /* scale for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */ +} rknn_tensor_attr; + +/* + the information for RKNN_QUERY_PERF_DETAIL. +*/ +typedef struct _rknn_perf_detail { + char* perf_data; /* the string pointer of perf detail. don't need free it by user. */ + uint64_t data_len; /* the string length. */ +} rknn_perf_detail; + +/* + the information for RKNN_QUERY_PERF_RUN. +*/ +typedef struct _rknn_perf_run { + int64_t run_duration; /* real inference time (us) */ +} rknn_perf_run; + +/* + the information for RKNN_QUERY_SDK_VERSION. +*/ +typedef struct _rknn_sdk_version { + char api_version[256]; /* the version of rknn api. */ + char drv_version[256]; /* the version of rknn driver. */ +} rknn_sdk_version; + +/* + The flags of rknn_tensor_mem. +*/ +typedef enum _rknn_tensor_mem_flags { + RKNN_TENSOR_MEMORY_FLAGS_UNKNOWN = 0, + RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE = 1, /*Used to mark in rknn_destroy_mem() whether it is necessary to release the "mem" pointer itself. + If the flag RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE is set, rknn_destroy_mem() will call free(mem).*/ + +} rknn_tensor_mem_flags; + + +/* + the memory information of tensor. +*/ +typedef struct _rknn_tensor_memory { + void* logical_addr; /* the virtual address of tensor buffer. */ + uint64_t physical_addr; /* the physical address of tensor buffer. */ + int32_t fd; /* the fd of tensor buffer. */ + uint32_t size; /* the size of tensor buffer. */ + uint32_t handle; /* the handle tensor buffer. */ + void * priv_data; /* the data which is reserved. */ + uint64_t reserved_flag; /* the flag which is reserved. */ +} rknn_tensor_mem; + +/* + the input information for rknn_input_set. +*/ +typedef struct _rknn_input { + uint32_t index; /* the input index. */ + void* buf; /* the input buf for index. */ + uint32_t size; /* the size of input buf. */ + uint8_t pass_through; /* pass through mode. + if TRUE, the buf data is passed directly to the input node of the rknn model + without any conversion. the following variables do not need to be set. + if FALSE, the buf data is converted into an input consistent with the model + according to the following type and fmt. so the following variables + need to be set.*/ + rknn_tensor_type type; /* the data type of input buf. */ + rknn_tensor_format fmt; /* the data format of input buf. + currently the internal input format of NPU is NCHW by default. + so entering NCHW data can avoid the format conversion in the driver. */ +} rknn_input; + +/* + the output information for rknn_outputs_get. +*/ +typedef struct _rknn_output { + uint8_t want_float; /* want transfer output data to float */ + uint8_t is_prealloc; /* whether buf is pre-allocated. 
+ if true, the following variables need to be set. + if false, The following variables do not need to be set. */ + uint32_t index; /* the output index. */ + void* buf; /* the output buf for index. + when is_prealloc = FALSE and rknn_outputs_release called, + this buf pointer will be free and don't use it anymore. */ + uint32_t size; /* the size of output buf. */ +} rknn_output; + +/* + the extend information for rknn_run. +*/ +typedef struct _rknn_run_extend { + uint64_t frame_id; /* output parameter, indicate current frame id of run. */ +} rknn_run_extend; + +/* + the extend information for rknn_outputs_get. +*/ +typedef struct _rknn_output_extend { + uint64_t frame_id; /* output parameter, indicate the frame id of outputs, corresponds to + struct rknn_run_extend.frame_id.*/ +} rknn_output_extend; + +/* + the information for RKNN_QUERY_RKNN_PRECOMPILE. +*/ +typedef struct _rknn_precompile { + void* model_data; /* the pointer of precompile model. don't need free it by user. */ + uint32_t data_len; /* the model length. */ +} rknn_precompile; + +/* rknn_init + + initial the context and load the rknn model. + + input: + rknn_context* context the pointer of context handle. + void* model pointer to the rknn model. + uint32_t size the size of rknn model. + uint32_t flag extend flag, see the define of RKNN_FLAG_XXX_XXX. + return: + int error code. +*/ +int rknn_init(rknn_context* context, void* model, uint32_t size, uint32_t flag); + + +/* rknn_destroy + + unload the rknn model and destroy the context. + + input: + rknn_context context the handle of context. + return: + int error code. +*/ +int rknn_destroy(rknn_context context); + + +/* rknn_query + + query the information about model or others. see rknn_query_cmd. + + input: + rknn_context context the handle of context. + rknn_query_cmd cmd the command of query. + void* info the buffer point of information. + uint32_t size the size of information. + return: + int error code. +*/ +int rknn_query(rknn_context context, rknn_query_cmd cmd, void* info, uint32_t size); + + +/* rknn_inputs_set + + set inputs information by input index of rknn model. + inputs information see rknn_input. + + input: + rknn_context context the handle of context. + uint32_t n_inputs the number of inputs. + rknn_input inputs[] the arrays of inputs information, see rknn_input. + return: + int error code +*/ +int rknn_inputs_set(rknn_context context, uint32_t n_inputs, rknn_input inputs[]); + + +/* rknn_inputs_map + + map inputs tensor memory information by input index of rknn model. + inputs information see rknn_input. + + input: + rknn_context context the handle of context. + uint32_t n_inputs the number of inputs. + rknn_tensor_mem mem[] the array of tensor memory information + return: + int error code +*/ +int rknn_inputs_map(rknn_context context, uint32_t n_inputs, rknn_tensor_mem mem[]); + + +/* rknn_inputs_sync + + synchronize inputs tensor buffer by input index of rknn model. + + input: + rknn_context context the handle of context. + uint32_t n_inputs the number of inputs. + rknn_tensor_mem mem[] the array of tensor memory information + return: + int error code +*/ +int rknn_inputs_sync(rknn_context context, uint32_t n_inputs, rknn_tensor_mem mem[]); + + +/* rknn_inputs_unmap + + unmap inputs tensor memory information by input index of rknn model. + inputs information see rknn_input. + + input: + rknn_context context the handle of context. + uint32_t n_inputs the number of inputs. 
+ rknn_tensor_mem mem[] the array of tensor memory information + return: + int error code +*/ +int rknn_inputs_unmap(rknn_context context, uint32_t n_inputs, rknn_tensor_mem mem[]); + + +/* rknn_run + + run the model to execute inference. + + input: + rknn_context context the handle of context. + rknn_run_extend* extend the extend information of run. + return: + int error code. +*/ +int rknn_run(rknn_context context, rknn_run_extend* extend); + + +/* rknn_outputs_get + + wait the inference to finish and get the outputs. + this function will block until inference finish. + the results will set to outputs[]. + + input: + rknn_context context the handle of context. + uint32_t n_outputs the number of outputs. + rknn_output outputs[] the arrays of output, see rknn_output. + rknn_output_extend* the extend information of output. + return: + int error code. +*/ +int rknn_outputs_get(rknn_context context, uint32_t n_outputs, rknn_output outputs[], rknn_output_extend* extend); + + +/* rknn_outputs_release + + release the outputs that get by rknn_outputs_get. + after called, the rknn_output[x].buf get from rknn_outputs_get will + also be free when rknn_output[x].is_prealloc = FALSE. + + input: + rknn_context context the handle of context. + uint32_t n_ouputs the number of outputs. + rknn_output outputs[] the arrays of output. + return: + int error code +*/ +int rknn_outputs_release(rknn_context context, uint32_t n_ouputs, rknn_output outputs[]); + + +/* rknn_outputs_map + + map the model output tensors memory information. + The difference between this function and "rknn_outputs_get" is + that it directly maps the model output tensor memory location to the user. + + input: + rknn_context context the handle of context. + uint32_t n_outputs the number of outputs. + rknn_tensor_mem mem[] the array of tensor memory information + return: + int error code. +*/ +int rknn_outputs_map(rknn_context context, uint32_t n_outputs, rknn_tensor_mem mem[]); + +/* rknn_outputs_sync + + synchronize the output tensors buffer to ensure cache cohenrency, wait the inference to finish. + + input: + rknn_context context the handle of context. + uint32_t n_outputs the number of outputs. + rknn_tensor_mem mem[] the array of tensor memory information + return: + int error code. +*/ +int rknn_outputs_sync(rknn_context context, uint32_t n_outputs, rknn_tensor_mem mem[]); + +/* rknn_outputs_unmap + + unmap the outputs memory information that get by rknn_outputs_map. + + input: + rknn_context context the handle of context. + uint32_t n_ouputs the number of outputs. + rknn_tensor_mem mem[] the array of tensor memory information + return: + int error code +*/ +int rknn_outputs_unmap(rknn_context context, uint32_t n_ouputs, rknn_tensor_mem mem[]); + +/* rknn_create_mem (memory allocated inside) + + Create tensor memory. This API require libdrm support! + + input: + rknn_context ctx the handle of context. + uint64_t size the size of tensor buffer. + return: + rknn_tensor_mem the pointer of tensor memory information. +*/ +rknn_tensor_mem* rknn_create_mem(rknn_context ctx, uint64_t size); + +/* rknn_destroy_mem (support allocate inside and outside) + + destroy tensor memory. + + input: + rknn_context ctx the handle of context. + rknn_tensor_mem *mem the pointer of tensor memory information. + return: + int error code +*/ +int rknn_destroy_mem(rknn_context ctx, rknn_tensor_mem *mem); + + + +/* rknn_set_io_mem + + set the input and output tensors buffer. + + input: + rknn_context ctx the handle of context. 
+        rknn_tensor_mem *mem        the array of tensor memory information.
+        rknn_tensor_attr *attr      the attribute of input or output tensor buffer.
+    return:
+        int                         error code.
+*/
+int rknn_set_io_mem(rknn_context ctx, rknn_tensor_mem *mem, rknn_tensor_attr *attr);
+
+#ifdef __cplusplus
+} //extern "C"
+#endif
+
+#endif //_RKNN_RUNTIME_H
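The map/sync entry points in this rknpu1 header enable a zero-copy variant of the usual set/run/get flow. Below is a minimal sketch against the declarations above — illustrative only, not a verified demo: `model_data`/`model_size` are assumed to hold an already loaded `.rknn` blob, and error handling is reduced to early returns.

```c
#include <string.h>
#include "rknn_runtime.h"

int run_zero_copy(void *model_data, uint32_t model_size) {
    rknn_context ctx;
    rknn_input_output_num io_num;

    if (rknn_init(&ctx, model_data, model_size, 0) != RKNN_SUCC)
        return -1;
    if (rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)) != RKNN_SUCC)
        return -1;

    rknn_tensor_mem in_mem[io_num.n_input];
    rknn_tensor_mem out_mem[io_num.n_output];
    memset(in_mem, 0, sizeof(in_mem));
    memset(out_mem, 0, sizeof(out_mem));

    /* map the model's input/output buffers instead of copying them */
    rknn_inputs_map(ctx, io_num.n_input, in_mem);
    rknn_outputs_map(ctx, io_num.n_output, out_mem);

    /* ... write preprocessed data to in_mem[i].logical_addr ... */
    rknn_inputs_sync(ctx, io_num.n_input, in_mem);    /* flush CPU writes */

    rknn_run(ctx, NULL);
    rknn_outputs_sync(ctx, io_num.n_output, out_mem); /* wait for inference, sync caches */

    /* ... read results from out_mem[i].logical_addr ... */

    rknn_inputs_unmap(ctx, io_num.n_input, in_mem);
    rknn_outputs_unmap(ctx, io_num.n_output, out_mem);
    rknn_destroy(ctx);
    return 0;
}
```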
diff --git a/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so b/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so
index ab04870..2891aea 100644
Binary files a/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so and b/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so differ
diff --git a/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so b/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so
index 225b81d..0cbcb45 100644
Binary files a/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so and b/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so differ
diff --git a/3rdparty/rknpu2/Linux/aarch64/librknnrt.so b/3rdparty/rknpu2/Linux/aarch64/librknnrt.so
index 39e5cab..3329ebc 100644
Binary files a/3rdparty/rknpu2/Linux/aarch64/librknnrt.so and b/3rdparty/rknpu2/Linux/aarch64/librknnrt.so differ
diff --git a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a
index 1c994a6..0c0a3ab 100644
Binary files a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a and b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a differ
diff --git a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so
index fb743f8..a0556c0 100644
Binary files a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so and b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so differ
diff --git a/3rdparty/rknpu2/Linux/armhf/librknnrt.so b/3rdparty/rknpu2/Linux/armhf/librknnrt.so
index c55ef30..8e40b8a 100644
Binary files a/3rdparty/rknpu2/Linux/armhf/librknnrt.so and b/3rdparty/rknpu2/Linux/armhf/librknnrt.so differ
diff --git a/3rdparty/rknpu2/include/rknn_api.h b/3rdparty/rknpu2/include/rknn_api.h
index 9203a38..08fb960 100644
--- a/3rdparty/rknpu2/include/rknn_api.h
+++ b/3rdparty/rknpu2/include/rknn_api.h
@@ -74,6 +74,20 @@ extern "C" {
 /* default nice -19, this flag can disable default priority */
 #define RKNN_FLAG_DISABLE_PROC_HIGH_PRIORITY 0x00002000
 
+/* Don't flush the input buffer cache; the user must ensure that the input tensor has flushed the cache before calling rknn_run.
+   !!! Don't use this flag when you call rknn_inputs_set() to set input data. */
+#define RKNN_FLAG_DISABLE_FLUSH_INPUT_MEM_CACHE 0x00004000
+
+/* Don't invalidate the output buffer cache.
+   Users cannot directly access output_mem->virt_addr,
+   which will cause cache consistency problems.
+   If you want to use output_mem->virt_addr,
+   you must use rknn_mem_sync(ctx, mem, RKNN_MEMORY_SYNC_FROM_DEVICE) to flush the cache.
+   This flag is generally used when the output data of the NPU is not accessed by the CPU,
+   but is accessed by the GPU or RGA, to reduce the time required to flush the cache.
+   !!! Don't use this flag when you call rknn_outputs_get() to get output data. */
+#define RKNN_FLAG_DISABLE_FLUSH_OUTPUT_MEM_CACHE 0x00008000
+
 /*
     Error code returned by the RKNN API.
 */
@@ -227,6 +241,7 @@ typedef enum _rknn_core_mask {
   RKNN_NPU_CORE_2 = 4,                                       /* run on NPU core 2. */
   RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1,     /* run on NPU core 0 and core 1. */
   RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 | RKNN_NPU_CORE_2, /* run on NPU core 0 and core 1 and core 2. */
+  RKNN_NPU_CORE_ALL = 0xffff,                                /* auto choice, run on NPU cores depending on platform */
   RKNN_NPU_CORE_UNDEFINED,
 } rknn_core_mask;
 
@@ -351,6 +366,15 @@ typedef enum _rknn_tensor_mem_flags {
   RKNN_TENSOR_MEMORY_FLAGS_UNKNOWN
 } rknn_tensor_mem_flags;
 
+/*
+    The flags used to allocate rknn memory.
+*/
+typedef enum _rknn_mem_alloc_flags {
+  RKNN_FLAG_MEMORY_FLAGS_DEFAULT = 0 << 0,  /* Same as RKNN_FLAG_MEMORY_CACHEABLE */
+  RKNN_FLAG_MEMORY_CACHEABLE = 1 << 0,      /* Create cacheable memory. */
+  RKNN_FLAG_MEMORY_NON_CACHEABLE = 1 << 1,  /* Create non-cacheable memory. */
+} rknn_mem_alloc_flags;
+
 /*
     The mode to sync cacheable rknn memory.
 */
@@ -529,6 +553,8 @@ int rknn_set_batch_core_num(rknn_context context, int core_num);
         RKNN_NPU_CORE_2: core 2 mode
         RKNN_NPU_CORE_0_1: combine core 0/1 mode
         RKNN_NPU_CORE_0_1_2: combine core 0/1/2 mode
+        RKNN_NPU_CORE_ALL: auto mode, select multiple npu cores to run depending on platform
+
 
     input:
         rknn_context context        the handle of context.
@@ -656,6 +682,18 @@ rknn_tensor_mem* rknn_create_mem_from_mb_blk(rknn_context ctx, void *mb_blk, int
 */
 rknn_tensor_mem* rknn_create_mem(rknn_context ctx, uint32_t size);
 
+/* rknn_create_mem2 (memory allocated inside)
+
+    create tensor memory.
+
+    input:
+        rknn_context ctx            the handle of context.
+        uint64_t size               the size of tensor buffer.
+        uint64_t alloc_flags        controls whether the memory is cacheable
+    return:
+        rknn_tensor_mem             the pointer of tensor memory information.
+*/
+rknn_tensor_mem* rknn_create_mem2(rknn_context ctx, uint64_t size, uint64_t alloc_flags);
 
 /* rknn_destroy_mem (support allocate inside and outside)
diff --git a/3rdparty/rknpu2/include/rknn_custom_op.h b/3rdparty/rknpu2/include/rknn_custom_op.h
index 65c482e..253263f 100644
--- a/3rdparty/rknpu2/include/rknn_custom_op.h
+++ b/3rdparty/rknpu2/include/rknn_custom_op.h
@@ -21,6 +21,11 @@ extern "C" {
 
 #include
 
+/*
+    Error code returned by the RKNN Custom Operator API.
+*/
+#define RKNN_WARNING_SKIP_CUSTOM_OP_COMPUTE -14 /* if the custom op init callback function returns this code and the op type is supported by RKNN, it will use the RKNN implementation.
*/ + #define RKNN_CUSTOM_OP_MAX_STR_LEN 64 #define RKNN_CUSTOM_OP_MAX_VALUE_LEN 32 #define RKNN_CUSTOM_OP_EXPORT __attribute__((visibility("default"))) diff --git a/3rdparty/rknpu2/include/rknn_matmul_api.h b/3rdparty/rknpu2/include/rknn_matmul_api.h index 5707493..26d6cc7 100644 --- a/3rdparty/rknpu2/include/rknn_matmul_api.h +++ b/3rdparty/rknpu2/include/rknn_matmul_api.h @@ -21,14 +21,32 @@ extern "C" { typedef rknn_context rknn_matmul_ctx; -/* - the process data type of matmul -*/ +typedef struct _rknn_quant_params +{ + char name[RKNN_MAX_NAME_LEN]; + + // matmul tensor scale + float* scale; + int32_t scale_len; + + // matmul tensor zero point + int32_t* zp; + int32_t zp_len; + +} rknn_quant_params; + typedef enum _rknn_matmul_type { RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT32 = 1, RKNN_INT8_MM_INT8_TO_INT32 = 2, + RKNN_INT8_MM_INT8_TO_INT8 = 3, + RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT16 = 4, + RKNN_FLOAT16_MM_INT8_TO_FLOAT32 = 5, + RKNN_FLOAT16_MM_INT8_TO_FLOAT16 = 6, + RKNN_FLOAT16_MM_INT4_TO_FLOAT32 = 7, + RKNN_FLOAT16_MM_INT4_TO_FLOAT16 = 8, RKNN_INT4_MM_INT4_TO_INT16 = 10, + RKNN_INT8_MM_INT4_TO_INT32 = 11, } rknn_matmul_type; inline static const char* get_matmul_type_string(rknn_matmul_type type) @@ -38,8 +56,20 @@ inline static const char* get_matmul_type_string(rknn_matmul_type type) return "RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT32"; case RKNN_INT8_MM_INT8_TO_INT32: return "RKNN_INT8_MM_INT8_TO_INT32"; + case RKNN_INT8_MM_INT8_TO_INT8: + return "RKNN_INT8_MM_INT8_TO_INT8"; + case RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT16: + return "RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT16"; + case RKNN_FLOAT16_MM_INT8_TO_FLOAT32: + return "RKNN_FLOAT16_MM_INT8_TO_FLOAT32"; + case RKNN_FLOAT16_MM_INT8_TO_FLOAT16: + return "RKNN_FLOAT16_MM_INT8_TO_FLOAT16"; case RKNN_INT4_MM_INT4_TO_INT16: return "RKNN_INT4_MM_INT4_TO_INT16"; + case RKNN_FLOAT16_MM_INT4_TO_FLOAT32: + return "RKNN_FLOAT16_MM_INT4_TO_FLOAT32"; + case RKNN_INT8_MM_INT4_TO_INT32: + return "RKNN_INT8_MM_INT4_TO_INT32"; default: return "UNKNOW"; } @@ -60,6 +90,7 @@ typedef struct _rknn_matmul_tensor_attr // int8 : A, B // int32: C rknn_tensor_type type; + } rknn_matmul_tensor_attr; typedef struct _rknn_matmul_io_attr @@ -70,6 +101,16 @@ typedef struct _rknn_matmul_io_attr rknn_matmul_tensor_attr C; } rknn_matmul_io_attr; +/* + matmul dynamic shape struct +*/ +typedef struct _rknn_matmul_shape +{ + int32_t M; + int32_t K; + int32_t N; +} rknn_matmul_shape; + /* matmul information struct */ @@ -77,14 +118,13 @@ typedef struct rknn_matmul_info_t { int32_t M; int32_t K; // limit: RK3566/3568: int8 type must be aligned with 32byte, float16 type must be aligned with 16byte; - // RK3562: int8 type must be aligned with 32byte, float16 type must be aligned with 32byte; - // RK3588: int8 type must be aligned with 32byte, float16 type must be aligned with 32byte, - // int4 type must be aligned with 32byte; + // RK3562: int8 type must be aligned with 32byte, float16 type must be aligned with 32byte; + // RK3588/3576: int8 type must be aligned with 32byte, float16 type must be aligned with 32byte, + // int4 type must be aligned with 32byte; int32_t N; // limit: RK3566/3568: int8 type must be aligned with 16byte, float16 type must be aligned with 8byte; - // RK3562: int8 type must be aligned with 16byte, float16 type must be aligned with 8byte; - // RK3588: int8 type must be aligned with 32byte, float16 type must be aligned with 16byte, - // int4 type must be aligned with 64byte; - + // RK3562: int8 type must be aligned with 16byte, float16 type must be aligned with 8byte; + // 
RK3588/3576: int8 type must be aligned with 32byte, float16 type must be aligned with 16byte, + // int4 type must be aligned with 64byte; // matmul data type // int4: int4(A) x int4(B) -> int16(C) // int8: int8(A) x int8(B) -> int32(C) @@ -94,12 +134,27 @@ typedef struct rknn_matmul_info_t // matmul native layout for B // 0: normal layout // 1: native layout - int32_t B_layout; + int16_t B_layout; + + // matmul quant type for B + // A and C only support per layer + // 0: per layer + // 1: per channel + int16_t B_quant_type; // matmul native layout for A and C // 0: normal layout // 1: native layout - int32_t AC_layout; + int16_t AC_layout; + + // matmul quant type for A and C, only support 0 + int16_t AC_quant_type; + + // iommu domain id, each domain has 4GB of space + int32_t iommu_domain_id; + + // reserved field + int8_t reserved[36]; } rknn_matmul_info; /* rknn_matmul_create @@ -113,6 +168,23 @@ typedef struct rknn_matmul_info_t */ int rknn_matmul_create(rknn_matmul_ctx* ctx, rknn_matmul_info* info, rknn_matmul_io_attr* io_attr); +/* rknn_matmul_create_dyn_shape + + params: + rknn_matmul_ctx *ctx the handle of context. + rknn_matmul_info *info the matmal information. + int shape_num the supported shape number of matmul. + rknn_matmul_shape dynamic_shapes[] the supported M,K,N shape struct array. + rknn_matmul_io_attr *io_attr the array of inputs and output attribute + return: + int error code +*/ +/* + 原来的info.M, K, N无效 +*/ +int rknn_matmul_create_dyn_shape(rknn_matmul_ctx* ctx, rknn_matmul_info* info, int shape_num, + rknn_matmul_shape dynamic_shapes[], rknn_matmul_io_attr io_attrs[]); + /* rknn_matmul_set_io_mem params: @@ -129,13 +201,12 @@ int rknn_matmul_create(rknn_matmul_ctx* ctx, rknn_matmul_info* info, rknn_matmul K max: k <= 10240 K limit: RK3566/3568: int8 type must be aligned with 32byte, float16 type must be aligned with 16byte; RK3562: int8 type must be aligned with 32byte, float16 type must be aligned with 32byte; - RK3588: int8 type must be aligned with 32byte, float16 type must be aligned with 32byte, + RK3588/3576: int8 type must be aligned with 32byte, float16 type must be aligned with 32byte, int4 type must be aligned with 32byte; N limit: RK3566/3568: int8 type must be aligned with 16byte, float16 type must be aligned with 8byte; RK3562: int8 type must be aligned with 16byte, float16 type must be aligned with 8byte; - RK3588: int8 type must be aligned with 32byte, float16 type must be aligned with 16byte, + RK3588/3576: int8 type must be aligned with 32byte, float16 type must be aligned with 16byte, int4 type must be aligned with 64byte; - A shape: M x K normal layout: (M, K) [M1K1, M1K2, ..., M1Kk, @@ -168,7 +239,7 @@ int rknn_matmul_create(rknn_matmul_ctx* ctx, rknn_matmul_info* info, rknn_matmul K9M2, K10M2, ..., K16M2, ... K(k-7)Mm, K(k-6)Mm, ..., KkMm] - for RK3588: + for RK3588/3576: int4: native layout: (K / 32, M, 32) [K1M1, K2M1, ..., K32M1, @@ -252,6 +323,30 @@ int rknn_matmul_create(rknn_matmul_ctx* ctx, rknn_matmul_info* info, rknn_matmul ... K(k-31)Nn, K(k-30)Nn, ..., KkNn] for RK3588: + when K > 8192, the B data will be split into T segments. + int T = std::ceil(K / 8192); + For example: normal layout -> native layout + K = 20488, N = 4096, T = 3, the data will be split into 3 segments. 
+ subN = rknn_matmul_io_attr.B.dims[2]; + subK = rknn_matmul_io_attr.B.dims[3]; + (8196, 4096) (4096 / subN, 8196 / subK, subN, subK) + (K, N) = (20488, 4096) -> (8196, 4096) -> (4096 / subN, 8196 / subK, subN, subK) + normal layout (4096, 4096) (4096 / subN, 4096 / subK, subN, subK) + T normal layout T native layout + It is recommended to use the rknn_B_normal_layout_to_native_layout interface for direct data conversion. + for RK3576: + when K > 4096, the B data will be split into T segments. + int T = std::ceil(K / 4096); + For example: normal layout -> native layout + K = 10240, N = 2048, T = 3, the data will be split into 3 segments. + subN = rknn_matmul_io_attr.B.dims[2]; + subK = rknn_matmul_io_attr.B.dims[3]; + (4096, 2048) (2048 / subN, 4096 / subK, subN, subK) + (K, N) = (10240, 2048) -> (4096, 2048) -> (2048 / subN, 4096 / subK, subN, subK) + normal layout (2048, 2048) (2048 / subN, 2048 / subK, subN, subK) + T normal layout T native layout + It is recommended to use the rknn_B_normal_layout_to_native_layout interface for direct data conversion. + for RK3588/3576: int4: native layout: (N / 64, K / 32, 64, 32) [K1N1, K2N1, ..., K32N1, @@ -334,6 +429,44 @@ int rknn_matmul_set_io_mem(rknn_matmul_ctx ctx, rknn_tensor_mem* mem, rknn_matmu */ int rknn_matmul_set_core_mask(rknn_matmul_ctx context, rknn_core_mask core_mask); +/* rknn_matmul_set_quant_params + + set quant params.(only support matmul type RKNN_INT8_MM_INT8_TO_INT8, RKNN_INT8_MM_INT8_TO_INT32) + + input: + rknn_matmul_ctx context the handle of context. + rknn_quant_params params quant params. + return: + int error code. +*/ +int rknn_matmul_set_quant_params(rknn_matmul_ctx context, rknn_quant_params* params); + +/* rknn_matmul_get_quant_params + + get per channel quant params.(only support matmul type RKNN_INT8_MM_INT8_TO_INT32) + + input: + rknn_matmul_ctx context the handle of context. + rknn_quant_params params quant params. + float scale get scale for user. + return: + int error code. +*/ +int rknn_matmul_get_quant_params(rknn_matmul_ctx ctx, rknn_quant_params* params, float* scale); + +/* rknn_matmul_set_dynamic_shape + + set the matmul input/output shape. matmul will run under current input shape after rknn_matmul_set_dynamic_shape, + only support M dynamicly now. + + input: + rknn_matmul_ctx ctx the handle of context. + rknn_matmul_shape* shape the M,K,N shape of matmul currently + return: + int error code. +*/ +int rknn_matmul_set_dynamic_shape(rknn_matmul_ctx ctx, rknn_matmul_shape* shape); + /* rknn_matmul_run run the matmul in blocking mode @@ -356,6 +489,24 @@ int rknn_matmul_run(rknn_matmul_ctx ctx); */ int rknn_matmul_destroy(rknn_matmul_ctx ctx); +/* rknn_B_normal_layout_to_native_layout + + change the B normal layout buffer to native layout buffer + + params: + void* B_input B normal layout buffer. + void* B_output B native layout buffer. + int K K + int N N + int subN subN + int subK subK + rknn_matmul_type type matmul type + return: + int error code. + */ +int rknn_B_normal_layout_to_native_layout(void* B_input, void* B_output, int K, int N, int subN, int subK, + rknn_matmul_type type); + #ifdef __cplusplus } // extern "C" #endif diff --git a/README.md b/README.md index fd68739..2f70051 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,9 @@ `RKNN Model Zoo` is developed based on the RKNPU SDK toolchain and provides deployment examples for current mainstream algorithms. Include the process of `exporting the RKNN model` and using `Python API` and `CAPI` to infer the RKNN model. 
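That `Python API`/`CAPI` workflow is compact enough to sketch end-to-end. The snippet below is an illustration against the rknpu2 C API (single UINT8 input, with a preprocessed image buffer and already-loaded model bytes assumed) — not one of the shipped demos:

```c
#include <string.h>
#include "rknn_api.h"

int infer_once(void *model_data, uint32_t model_size, void *img, uint32_t img_size) {
    rknn_context ctx;
    rknn_input_output_num io_num;

    if (rknn_init(&ctx, model_data, model_size, 0, NULL) != RKNN_SUCC)
        return -1;
    rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));

    rknn_input input;
    memset(&input, 0, sizeof(input));
    input.index = 0;
    input.buf   = img;              /* preprocessed image, NHWC uint8 */
    input.size  = img_size;
    input.type  = RKNN_TENSOR_UINT8;
    input.fmt   = RKNN_TENSOR_NHWC;
    rknn_inputs_set(ctx, 1, &input);

    rknn_run(ctx, NULL);

    rknn_output outputs[io_num.n_output];
    memset(outputs, 0, sizeof(rknn_output) * io_num.n_output);
    for (uint32_t i = 0; i < io_num.n_output; i++) {
        outputs[i].index = i;
        outputs[i].want_float = 1;  /* dequantize to float for post-processing */
    }
    rknn_outputs_get(ctx, io_num.n_output, outputs, NULL);

    /* ... post-process outputs[i].buf ... */

    rknn_outputs_release(ctx, io_num.n_output, outputs);
    rknn_destroy(ctx);
    return 0;
}
```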
-- Support `RK3562`, `RK3566`, `RK3568`, `RK3588` platforms. (`RV1103`, `RV1106` platforms support `mobilenet`, `yolov5`)
-- `RK1808`, `RK3399PRO`, `RV1109`, `RV1126` will be supported in next version. (For `yolov5/6/7/8`, `yolox`, `ppyoloe` demos, they are available in [`v1.5.0`](https://github.com/airockchip/rknn_model_zoo/tree/v1.5.0), please switch to [`v1.5.0`](https://github.com/airockchip/rknn_model_zoo/tree/v1.5.0) to get them)
+- Support `RK3562`, `RK3566`, `RK3568`, `RK3588`, `RK3576` platforms.
+- Limited support for `RV1103`, `RV1106` platforms.
+- Support `RK1808`, `RV1109`, `RV1126` platforms.
 
@@ -25,7 +26,7 @@
 In addition to exporting the model from the corresponding respository, the models file are available on https://console.zbox.filez.com/l/8ufwtG (key: rknn).
 
-| Demo
(Clip to Description) | Algorithm Category | Dtype support | Pretrain model
(Clip to download) | +| Demo
| Algorithm Category | Dtype support | Pretrain model
| | --------------------------------------------------------- | -------------------------- | ------------- | ------------------------------------------------------------ | | [mobilenet](./examples/mobilenet/README.md) | Classification | FP16/INT8 | [mobilenetv2-12.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/MobileNet/mobilenetv2-12.onnx) | | [resnet](./examples/resnet/README.md) | Classification | FP16/INT8 | [resnet50-v2-7.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ResNet/resnet50-v2-7.onnx) | @@ -38,7 +39,7 @@ In addition to exporting the model from the corresponding respository, the model | [deeplabv3](./examples/deeplabv3/README.md) | Image segmentation | FP16/INT8 | [deeplab-v3-plus-mobilenet-v2.pb](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/Deeplabv3/deeplab-v3-plus-mobilenet-v2.pb) | | [yolov5-seg](./examples/yolov5_seg/README.md) | Image segmentation | FP16/INT8 | [yolov5n-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov5_seg/yolov5n-seg.onnx)
[yolov5s-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov5_seg/yolov5s-seg.onnx)
[yolov5m-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov5_seg/yolov5m-seg.onnx) | | [yolov8-seg](./examples/yolov8_seg/README.md) | Image segmentation | FP16/INT8 | [yolov8n-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_seg/yolov8n-seg.onnx)
[yolov8s-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_seg/yolov8s-seg.onnx)
[yolov8m-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_seg/yolov8m-seg.onnx) | -| [ppseg](./examples/ppseg/README.md) | Image segmentation | FP16 | [pp_liteseg_cityscapes.onnx](https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx) | +| [ppseg](./examples/ppseg/README.md) | Image segmentation | FP16 | [pp_liteseg_cityscapes.onnx](https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx ) | | [RetinaFace](./examples/RetinaFace/README.md) | Face key points | INT8 | [RetinaFace_mobile320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_mobile320.onnx)
[RetinaFace_resnet50_320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_resnet50_320.onnx) | | [LPRNet](./examples/LPRNet/README.md) | Car Plate Recognition | FP16/INT8 | [lprnet.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/LPRNet/lprnet.onnx) | | [PPOCR-Det](./examples/PPOCR/PPOCR-Det/README.md) | Text detection | FP16/INT8 | [ppocrv4_det.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/PPOCR/ppocrv4_det.onnx) | @@ -51,44 +52,46 @@ In addition to exporting the model from the corresponding respository, the model ## Model performance benchmark(FPS) -| demo | model_name | inputs_shape | dtype | RK3566
RK3568 | RK3562 | RK3588
@single_core | -| ---------------- | ---------------------------- | ----------------------- | ----- | ------------------ | ------ | ------------------------ | -| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 197.4 | 266.8 | 433.0 | -| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 40.6 | 54.5 | 108.6 | -| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 26.7 | 31.6 | 63.3 | -| | yolov5n | [1, 3, 640, 640] | INT8 | 41.6 | 43.8 | 68.1 | -| | yolov5s | [1, 3, 640, 640] | INT8 | 19.9 | 22.7 | 42.5 | -| | yolov5m | [1, 3, 640, 640] | INT8 | 8.7 | 10.6 | 19.3 | -| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 50.2 | 51.5 | 93.8 | -| | yolov6s | [1, 3, 640, 640] | INT8 | 15.2 | 16.8 | 34.1 | -| | yolov6m | [1, 3, 640, 640] | INT8 | 7.5 | 8.0 | 17.6 | -| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 29.9 | 34.9 | 69.7 | -| | yolov7 | [1, 3, 640, 640] | INT8 | 4.7 | 5.5 | 10.9 | -| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 35.7 | 38.5 | 59.6 | -| | yolov8s | [1, 3, 640, 640] | INT8 | 15.4 | 17.1 | 32.8 | -| | yolov8m | [1, 3, 640, 640] | INT8 | 6.6 | 7.5 | 14.8 | -| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.5 | 17.7 | 32.9 | -| | yolox_m | [1, 3, 640, 640] | INT8 | 6.7 | 8.1 | 14.8 | -| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 17.5 | 19.7 | 32.9 | -| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 7.9 | 8.3 | 16.2 | -| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.7 | 20.7 | 34.4 | -| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 33.9 | 36.3 | 58.0 | -| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.3 | 17.2 | 32.6 | -| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.1 | 15.2 | -| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 29.1 | 30.7 | 49.1 | -| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.8 | 11.3 | 25.4 | -| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.1 | 11.6 | -| ppseg | pp_liteseg_cityscapes | [1, 3, 512, 512] | FP16 | 2.6 | 4.6 | 13.0 | -| RetinaFace | RetinaFace_mobile320 | [1, 3, 320, 320] | INT8 | 142.5 | 279.5 | 234.7 | -| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.5 | 26.0 | 48.8 | -| LPRNet | lprnet | [1, 3, 24, 94] | INT8 | 58.2 | 119.7 | 204.4 | -| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 24.4 | 27.5 | 43.0 | -| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 20.0 | 45.1 | 35.7 | -| lite_transformer | lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 130.8 | 656.7 | 261.5 | -| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 114.3 | 151.3 | 164.0 | +| demo | model_name | inputs_shape | dtype | RK3566 RK3568 | RK3562 | RK3588 @single_core | RK3576 @single_core | RK3576
@single_core @sparse_weight | RV1109 | RV1126 | RK1808 | +| ---------------- | ---------------------------- | ----------------------- | ----- | ------------- | ------ | ------------------- | ------------------- | ---------------------------------------- | ------ | ------ | ------ | +| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 197.4 | 266.8 | 433.0 | 452.3 | 483.9 | 213.5 | 316.5 | 168.6 | +| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 40.6 | 54.5 | 108.6 | 97.4 | 129.9 | 24.5 | 36.4 | 37.0 | +| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 26.7 | 31.6 | 63.3 | 62.6 | 82.0 | 20.3 | 29.3 | 36.7 | +| | yolov5n | [1, 3, 640, 640] | INT8 | 41.6 | 43.8 | 68.1 | 104.4 | 112.2 | 36.4 | 53.5 | 61.0 | +| | yolov5s | [1, 3, 640, 640] | INT8 | 19.9 | 22.7 | 42.5 | 54.2 | 65.5 | 13.7 | 20.1 | 28.1 | +| | yolov5m | [1, 3, 640, 640] | INT8 | 8.7 | 10.6 | 19.3 | 23.0 | 31.5 | 5.8 | 8.5 | 13.1 | +| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 50.2 | 51.5 | 93.8 | 98.6 | 136.6 | 37.7 | 56.8 | 66.4 | +| | yolov6s | [1, 3, 640, 640] | INT8 | 15.2 | 16.8 | 34.1 | 33.1 | 55.3 | 10.9 | 16.4 | 24.0 | +| | yolov6m | [1, 3, 640, 640] | INT8 | 7.5 | 8.0 | 17.6 | 17.0 | 27.8 | 5.7 | 8.3 | 11.4 | +| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 29.9 | 34.9 | 69.7 | 70.9 | 91.8 | 15.6 | 22.5 | 37.2 | +| | yolov7 | [1, 3, 640, 640] | INT8 | 4.7 | 5.5 | 10.9 | 12.5 | 17.9 | 3.3 | 4.9 | 7.4 | +| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 35.7 | 38.5 | 59.6 | 79.5 | 95.6 | 24.1 | 36.0 | 41.9 | +| | yolov8s | [1, 3, 640, 640] | INT8 | 15.4 | 17.1 | 32.8 | 38.7 | 52.4 | 9.0 | 13.2 | 19.1 | +| | yolov8m | [1, 3, 640, 640] | INT8 | 6.6 | 7.5 | 14.8 | 15.9 | 23.5 | 3.9 | 5.8 | 9.1 | +| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.5 | 17.7 | 32.9 | 36.4 | 46.7 | 10.6 | 15.7 | 22.9 | +| | yolox_m | [1, 3, 640, 640] | INT8 | 6.7 | 8.1 | 14.8 | 16.5 | 23.2 | 4.7 | 6.8 | 10.5 | +| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 17.5 | 19.7 | 32.9 | 30.0 | 34.4 | 11.3 | 16.4 | 21.0 | +| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 7.9 | 8.3 | 16.2 | 12.9 | 14.8 | 5.2 | 7.7 | 9.4 | +| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.7 | 20.7 | 34.4 | 38.1 | 42.5 | 10.3 | 13.1 | 4.4 | +| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 33.9 | 36.3 | 58.0 | 82.4 | 92.2 | 28.7 | 41.9 | 49.6 | +| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.3 | 17.2 | 32.6 | 39.5 | 51.1 | 9.7 | 14.0 | 22.4 | +| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.1 | 15.2 | 17.2 | 25.1 | 4.7 | 6.9 | 10.7 | +| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 29.1 | 30.7 | 49.1 | 64.5 | 78.0 | 18.6 | 27.8 | 32.7 | +| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.8 | 11.3 | 25.4 | 29.3 | 39.7 | 6.7 | 9.8 | 14.5 | +| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.1 | 11.6 | 12.1 | 18.1 | 3.1 | 4.6 | 6.8 | +| ppseg | ppseg_lite_1024x512 | [1, 3, 512, 512] | INT8 | 2.6 | 4.6 | 13.0 | 8.7 | 35.5 | 18.4 | 27.2 | 14.7 | +| RetinaFace | RetinaFace_mobile320 | [1, 3, 320, 320] | INT8 | 142.5 | 279.5 | 234.7 | 416.0 | 396.8 | 146.3 | 210.1 | 242.2 | +| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.5 | 26.0 | 48.8 | 47.3 | 70.4 | 14.7 | 20.9 | 24.2 | +| LPRNet | lprnet | [1, 3, 24, 94] | INT8 | 58.2 | 119.7 | 204.4 | 130.2 | 130.6 | 30.6 | 47.8 | 30.1 | +| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 24.4 | 27.5 | 43.0 | 46.1 | 47.0 | 11.1 | 16.2 | 9.1 | +| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 20.0 | 45.1 | 35.7 | 55 | 58.9 | 1.0 | 1.6 | 6.7 | +| lite_transformer | 
lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 130.8 | 656.7 | 261.5 | 609.1 | 674.8 | 22.7 | 35.6 | 97.8 |
+| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 114.3 | 151.3 | 164.0 | 240 | 341.8 | 49.0 | 66.3 | 114.9 |
 
 - This performance data are collected based on the maximum NPU frequency of each platform.
 - This performance data calculate the time-consuming of model inference. Does not include the time-consuming of pre-processing and post-processing.
+- RK3576 with sparse_weight refers to the performance when the sparse weight is enabled for the model.
+- Note: Models with sparse weight (via kernel) should show improved performance, but may show accuracy drops depending on the model.
@@ -98,23 +101,24 @@ For Linux develop board:
 
 ```sh
 ./build-linux.sh -t <target> -a <arch> -d <demo_name> [-b <build_type>] [-m]
-    -t : target (rk356x/rk3588/rv1106)
+    -t : target (rk356x/rk3588/rk3576/rv1106/rk1808/rv1126)
     -a : arch (aarch64/armhf)
     -d : demo name
     -b : build_type(Debug/Release)
     -m : enable address sanitizer, build_type need set to Debug
+Note: 'rk356x' represents rk3562/rk3566/rk3568, 'rv1106' represents rv1103/rv1106, 'rv1126' represents rv1109/rv1126
 
 # Here is an example for compiling yolov5 demo for 64-bit Linux RK3566.
 ./build-linux.sh -t rk356x -a aarch64 -d yolov5
 ```
 
-For Android develop board:
+For Android development board:
 
 ```sh
 # For Android develop boards, it's require to set path for Android NDK compilation tool path according to the user environment
 export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b
 
 ./build-android.sh -t <target> -a <arch> -d <demo_name> [-b <build_type>] [-m]
-    -t : target (rk356x/rk3588)
+    -t : target (rk356x/rk3588/rk3576)
     -a : arch (arm64-v8a/armeabi-v7a)
     -d : demo name
     -b : build_type (Debug/Release)
     -m : enable address sanitizer, build_type need set to Debug
@@ -130,6 +134,7 @@ export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b
 
 | Version | Description |
 | ------- | ------------------------------------------------------------ |
+| 2.0.0 | Add new support for `RK3576` for all demos.
Full support for `RK1808`, `RV1109`, `RV1126` platforms. |
| 1.6.0 | New demo release, including object detection, image segmentation, OCR, car plate detection&recognition etc.
Full support for `RK3566`, `RK3568`, `RK3588`, `RK3562` platforms.
Limited support for `RV1103`, `RV1106` platforms. |
| 1.5.0 | Yolo detection demo release. |
@@ -139,10 +144,11 @@ export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b
 
 All demos in `RKNN Model Zoo` are verified based on the latest RKNPU SDK. If using a lower version for verification, the inference performance and inference results may be wrong.
 
-| Version | RKNPU2 SDK | RKNPU1 SDK |
-| ------- | ---------- | ------------- |
-| 1.6.0 | >=1.6.0 | - Coming soon |
-| 1.5.0 | >=1.5.0 | >=1.7.3 |
+| Version | RKNPU2 SDK | RKNPU1 SDK |
+| ------- | ---------- | ---------- |
+| 2.0.0 | >=2.0.0 | >=1.7.5 |
+| 1.6.0 | >=1.6.0 | - |
+| 1.5.0 | >=1.5.0 | >=1.7.3 |
diff --git a/README_CN.md b/README_CN.md
index b1e617e..32416e5 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -8,9 +8,9 @@
 RKNN Model Zoo基于 RKNPU SDK 工具链开发, 提供了目前主流算法的部署例程. 例程包含导出RKNN模型, 使用 Python API, CAPI 推理 RKNN 模型的流程.
 
-- 例程支持 `RK3562`, `RK3566`, `RK3568`, `RK3588` 平台。(`RV1103`, `RV1106` 平台支持`mobilenet`, `yolov5` 例程)
-
-- 暂不支持 `RK1808`, `RK3399PRO`, `RV1109`, `RV1126` 平台. 预计于下个版本支持. (`yolov5/6/7/8`, `yolox`, `ppyoloe` 例程, 在[`v1.5.0`](https://github.com/airockchip/rknn_model_zoo/tree/v1.5.0)版本中已经支持, 有需求的用户可切回[`v1.5.0`](https://github.com/airockchip/rknn_model_zoo/tree/v1.5.0)版本获取对应的demo)
+- 支持 `RK3562`, `RK3566`, `RK3568`, `RK3588`, `RK3576` 平台。
+- 部分支持 `RV1103`, `RV1106` 平台。
+- 支持 `RK1808`, `RV1109`, `RV1126` 平台。
 
@@ -26,7 +26,7 @@
 RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要用到Android NDK编译工具, 编译Linux demo时需要用到交叉编译工具.
 
 以下demo除了从对应的仓库导出模型, 也可从网盘 https://console.zbox.filez.com/l/8ufwtG (提取码: rknn) 下载模型文件.
 
-| Demo
(Clip to Description) | 算法类别 | Demo支持类型 | Pretrain model
(Clip to download) | +| Demo
| 算法类别 | Demo支持类型 | Pretrain model
| | --------------------------------------------------------- | ---------- | ------------ | ------------------------------------------------------------ | | [mobilenet](./examples/mobilenet/README.md) | 图像分类 | FP16/INT8 | [mobilenetv2-12.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/MobileNet/mobilenetv2-12.onnx) | | [resnet](./examples/resnet/README.md) | 图像分类 | FP16/INT8 | [resnet50-v2-7.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ResNet/resnet50-v2-7.onnx) | @@ -39,7 +39,7 @@ RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要 | [deeplabv3](./examples/deeplabv3/README.md) | 图像分割 | FP16/INT8 | [deeplab-v3-plus-mobilenet-v2.pb](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/Deeplabv3/deeplab-v3-plus-mobilenet-v2.pb) | | [yolov5-seg](./examples/yolov5_seg/README.md) | 图像分割 | FP16/INT8 | [yolov5n-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov5_seg/yolov5n-seg.onnx)
[yolov5s-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov5_seg/yolov5s-seg.onnx)
[yolov5m-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov5_seg/yolov5m-seg.onnx) | | [yolov8-seg](./examples/yolov8_seg/README.md) | 图像分割 | FP16/INT8 | [yolov8n-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_seg/yolov8n-seg.onnx)
[yolov8s-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_seg/yolov8s-seg.onnx)
[yolov8m-seg.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_seg/yolov8m-seg.onnx) | -| [ppseg](./examples/ppseg/README.md) | 图像分割 | FP16 | [pp_liteseg_cityscapes.onnx](https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx) | +| [ppseg](./examples/ppseg/README.md) | 图像分割 | FP16 | [pp_liteseg_cityscapes.onnx](https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx ) | | [RetinaFace](./examples/RetinaFace/README.md) | 人脸关键点 | INT8 | [RetinaFace_mobile320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_mobile320.onnx)
[RetinaFace_resnet50_320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_resnet50_320.onnx) | | [LPRNet](./examples/LPRNet/README.md) | 车牌识别 | FP16/INT8 | [lprnet.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/LPRNet/lprnet.onnx) | | [PPOCR-Det](./examples/PPOCR/PPOCR-Det/README.md) | 文字检测 | FP16/INT8 | [ppocrv4_det.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/PPOCR/ppocrv4_det.onnx) | @@ -50,44 +50,46 @@ RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要 ## Model performance benchmark(FPS) -| demo | model_name | inputs_shape | dtype | RK3566
RK3568 | RK3562 | RK3588
@single_core | -| ---------------- | ---------------------------- | ----------------------- | ----- | ------------------ | ------ | ------ | -| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 197.4 | 266.8 | 433.0 | -| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 40.6 | 54.5 | 108.6 | -| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 26.7 | 31.6 | 63.3 | -| | yolov5n | [1, 3, 640, 640] | INT8 | 41.6 | 43.8 | 68.1 | -| | yolov5s | [1, 3, 640, 640] | INT8 | 19.9 | 22.7 | 42.5 | -| | yolov5m | [1, 3, 640, 640] | INT8 | 8.7 | 10.6 | 19.3 | -| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 50.2 | 51.5 | 93.8 | -| | yolov6s | [1, 3, 640, 640] | INT8 | 15.2 | 16.8 | 34.1 | -| | yolov6m | [1, 3, 640, 640] | INT8 | 7.5 | 8.0 | 17.6 | -| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 29.9 | 34.9 | 69.7 | -| | yolov7 | [1, 3, 640, 640] | INT8 | 4.7 | 5.5 | 10.9 | -| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 35.7 | 38.5 | 59.6 | -| | yolov8s | [1, 3, 640, 640] | INT8 | 15.4 | 17.1 | 32.8 | -| | yolov8m | [1, 3, 640, 640] | INT8 | 6.6 | 7.5 | 14.8 | -| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.5 | 17.7 | 32.9 | -| | yolox_m | [1, 3, 640, 640] | INT8 | 6.7 | 8.1 | 14.8 | -| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 17.5 | 19.7 | 32.9 | -| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 7.9 | 8.3 | 16.2 | -| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.7 | 20.7 | 34.4 | -| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 33.9 | 36.3 | 58.0 | -| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.3 | 17.2 | 32.6 | -| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.1 | 15.2 | -| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 29.1 | 30.7 | 49.1 | -| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.8 | 11.3 | 25.4 | -| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.1 | 11.6 | -| ppseg | pp_liteseg_cityscapes | [1, 3, 512, 512] | FP16 | 2.6 | 4.6 | 13.0 | -| RetinaFace | RetinaFace_mobile320 | [1, 3, 320, 320] | INT8 | 142.5 | 279.5 | 234.7 | -| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.5 | 26.0 | 48.8 | -| LPRNet | lprnet | [1, 3, 24, 94] | INT8 | 58.2 | 119.7 | 204.4 | -| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 24.4 | 27.5 | 43.0 | -| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 20.0 | 45.1 | 35.7 | -| lite_transformer | lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 130.8 | 656.7 | 261.5 | -| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 114.3 | 151.3 | 164.0 | +| demo | model_name | inputs_shape | dtype | RK3566
RK3568 | RK3562 | RK3588
@single_core | RK3576
@single_core | RK3576
@single_core
@sparse_weight | RV1109 | RV1126 | RK1808 | +| ---------------- | ---------------------------- | ----------------------- | ----- | ------------------ | ------ | ------ | ------ | ------ | ------ | ------ | ------ | +| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 197.4 | 266.8 | 433.0 | 452.3 | 483.9 | 213.5 | 316.5 | 168.6 | +| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 40.6 | 54.5 | 108.6 | 97.4 | 129.9 | 24.5 | 36.4 | 37.0 | +| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 26.7 | 31.6 | 63.3 | 62.6 | 82.0 | 20.3 | 29.3 | 36.7 | +| | yolov5n | [1, 3, 640, 640] | INT8 | 41.6 | 43.8 | 68.1 | 104.4 | 112.2 | 36.4 | 53.5 | 61.0 | +| | yolov5s | [1, 3, 640, 640] | INT8 | 19.9 | 22.7 | 42.5 | 54.2 | 65.5 | 13.7 | 20.1 | 28.1 | +| | yolov5m | [1, 3, 640, 640] | INT8 | 8.7 | 10.6 | 19.3 | 23.0 | 31.5 | 5.8 | 8.5 | 13.1 | +| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 50.2 | 51.5 | 93.8 | 98.6 | 136.6 | 37.7 | 56.8 | 66.4 | +| | yolov6s | [1, 3, 640, 640] | INT8 | 15.2 | 16.8 | 34.1 | 33.1 | 55.3 | 10.9 | 16.4 | 24.0 | +| | yolov6m | [1, 3, 640, 640] | INT8 | 7.5 | 8.0 | 17.6 | 17.0 | 27.8 | 5.7 | 8.3 | 11.4 | +| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 29.9 | 34.9 | 69.7 | 70.9 | 91.8 | 15.6 | 22.5 | 37.2 | +| | yolov7 | [1, 3, 640, 640] | INT8 | 4.7 | 5.5 | 10.9 | 12.5 | 17.9 | 3.3 | 4.9 | 7.4 | +| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 35.7 | 38.5 | 59.6 | 79.5 | 95.6 | 24.1 | 36.0 | 41.9 | +| | yolov8s | [1, 3, 640, 640] | INT8 | 15.4 | 17.1 | 32.8 | 38.7 | 52.4 | 9.0 | 13.2 | 19.1 | +| | yolov8m | [1, 3, 640, 640] | INT8 | 6.6 | 7.5 | 14.8 | 15.9 | 23.5 | 3.9 | 5.8 | 9.1 | +| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.5 | 17.7 | 32.9 | 36.4 | 46.7 | 10.6 | 15.7 | 22.9 | +| | yolox_m | [1, 3, 640, 640] | INT8 | 6.7 | 8.1 | 14.8 | 16.5 | 23.2 | 4.7 | 6.8 | 10.5 | +| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 17.5 | 19.7 | 32.9 | 30.0 | 34.4 | 11.3 | 16.4 | 21.0 | +| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 7.9 | 8.3 | 16.2 | 12.9 | 14.8 | 5.2 | 7.7 | 9.4 | +| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.7 | 20.7 | 34.4 | 38.1 | 42.5 | 10.3 | 13.1 | 4.4 | +| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 33.9 | 36.3 | 58.0 | 82.4 | 92.2 | 28.7 | 41.9 | 49.6 | +| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.3 | 17.2 | 32.6 | 39.5 | 51.1 | 9.7 | 14.0 | 22.4 | +| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.1 | 15.2 | 17.2 | 25.1 | 4.7 | 6.9 | 10.7 | +| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 29.1 | 30.7 | 49.1 | 64.5 | 78.0 | 18.6 | 27.8 | 32.7 | +| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.8 | 11.3 | 25.4 | 29.3 | 39.7 | 6.7 | 9.8 | 14.5 | +| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.1 | 11.6 | 12.1 | 18.1 | 3.1 | 4.6 | 6.8 | +| ppseg | ppseg_lite_1024x512 | [1, 3, 512, 512] | INT8 | 2.6 | 4.6 | 13.0 | 8.7 | 35.5 | 18.4 | 27.2 | 14.7 | +| RetinaFace | RetinaFace_mobile320 | [1, 3, 320, 320] | INT8 | 142.5 | 279.5 | 234.7 | 416.0 | 396.8 | 146.3 | 210.1 | 242.2 | +| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.5 | 26.0 | 48.8 | 47.3 | 70.4 | 14.7 | 20.9 | 24.2 | +| LPRNet | lprnet | [1, 3, 24, 94] | INT8 | 58.2 | 119.7 | 204.4 | 130.2 | 130.6 | 30.6 | 47.8 | 30.1 | +| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 24.4 | 27.5 | 43.0 | 46.1 | 47.0 | 11.1 | 16.2 | 9.1 | +| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 20.0 | 45.1 | 35.7 | 55 | 58.9 | 1.0 | 1.6 | 6.7 | +| lite_transformer | lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 130.8 | 656.7 | 261.5 
| 609.1 | 674.8 | 22.7 | 35.6 | 97.8 | +| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 114.3 | 151.3 | 164.0 | 240 | 341.8 | 49.0 | 66.3 | 114.9 | - 该性能数据基于各平台的最大NPU频率进行测试 - 该性能数据指模型推理的耗时, 不包含前后处理的耗时 +- RK3576带sparse_weight是指模型开启kernel方向稀疏化 (按照4:2) 的性能 +- 注: 模型开启kernel的稀疏化 (按照4:2) 可以提升一些性能,精度可能只有些微损失(具体还需要看模型) @@ -97,11 +99,12 @@ RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要 ```sh ./build-linux.sh -t -a -d [-b ] [-m] - -t : target (rk356x/rk3588/rv1106) + -t : target (rk356x/rk3588/rk3576/rv1106/rk1808/rv1126) -a : arch (aarch64/armhf) -d : demo name -b : build_type(Debug/Release) -m : enable address sanitizer, build_type need set to Debug +Note: 'rk356x' represents rk3562/rk3566/rk3568, 'rv1106' represents rv1103/rv1106, 'rv1126' represents rv1109/rv1126 # 以编译64位Linux RK3566的yolov5 demo为例: ./build-linux.sh -t rk356x -a aarch64 -d yolov5 @@ -113,7 +116,7 @@ RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要 # 对于 Android 系统的开发板, 首先需要根据实际情况, 设置安卓NDK编译工具的路径 export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b ./build-android.sh -t -a -d [-b ] [-m] - -t : target (rk356x/rk3588) + -t : target (rk356x/rk3588/rk3576) -a : arch (arm64-v8a/armeabi-v7a) -d : demo name -b : build_type (Debug/Release) @@ -129,6 +132,7 @@ export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b | 版本 | 说明 | | ----- | ------------------------------------------------------------ | +| 2.0.0 | 新增所有示例`RK3576`平台的支持
支持`RK1808`, `RV1109`, `RV1126`平台 | | 1.6.0 | 提供目标检测、图像分割、OCR、车牌识别等多个例程
支持`RK3562`, `RK3566`, `RK3568`, `RK3588`平台
部分支持`RV1103`, `RV1106`平台 | | 1.5.0 | 提供Yolo检测模型的demo | @@ -138,10 +142,11 @@ export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b RKNN Model Zoo 的例程基于当前最新的 RKNPU SDK 进行验证。若使用低版本的 RKNPU SDK 进行验证, 推理性能、推理结果可能会有差异。 -| 版本 | RKNPU2 SDK | RKNPU1 SDK | -| ----- | ---------- | ------------- | -| 1.6.0 | >=1.6.0 | - Coming soon | -| 1.5.0 | >=1.5.0 | >=1.7.3 | +| 版本 | RKNPU2 SDK | RKNPU1 SDK | +| ----- | ---------- | ---------- | +| 2.0.0 | >=2.0.0 | >=1.7.5 | +| 1.6.0 | >=1.6.0 | - | +| 1.5.0 | >=1.5.0 | >=1.7.3 | diff --git a/asset/yolo_too_much_box.png b/asset/yolo_too_much_box.png new file mode 100644 index 0000000..d0eb058 Binary files /dev/null and b/asset/yolo_too_much_box.png differ diff --git a/asset/yolov5_without_sigmoid_out.png b/asset/yolov5_without_sigmoid_out.png new file mode 100644 index 0000000..213ae06 Binary files /dev/null and b/asset/yolov5_without_sigmoid_out.png differ diff --git a/build-android.sh b/build-android.sh index 44ae4b5..349793d 100644 --- a/build-android.sh +++ b/build-android.sh @@ -50,7 +50,7 @@ done if [ -z ${TARGET_SOC} ] || [ -z ${TARGET_ARCH} ] || [ -z ${BUILD_DEMO_NAME} ]; then echo "$0 -t -a -d [-b ] [-m]" echo "" - echo " -t : target (rk356x/rk3588)" + echo " -t : target (rk356x/rk3588/rk3576)" echo " -a : arch (arm64-v8a/armeabi-v7a)" echo " -d : demo name" echo " -b : build_type (Debug/Release)" @@ -110,9 +110,12 @@ case ${TARGET_SOC} in rk3562) TARGET_SOC="rk356x" ;; + rk3576) + TARGET_SOC="rk3576" + ;; *) echo "Invalid target: ${TARGET_SOC}" - echo "Valid target: rk3562,rk3566,rk3568,rk3588" + echo "Valid target: rk3562,rk3566,rk3568,rk3588,rk3576" exit -1 ;; esac diff --git a/build-linux.sh b/build-linux.sh index 0294fa6..2a27842 100644 --- a/build-linux.sh +++ b/build-linux.sh @@ -34,13 +34,13 @@ done if [ -z ${TARGET_SOC} ] || [ -z ${BUILD_DEMO_NAME} ]; then echo "$0 -t -a -d [-b ] [-m]" echo "" - echo " -t : target (rk356x/rk3588/rv1106)" + echo " -t : target (rk356x/rk3588/rk3576/rv1106/rk1808/rv1126)" echo " -a : arch (aarch64/armhf)" echo " -d : demo name" echo " -b : build_type(Debug/Release)" echo " -m : enable address sanitizer, build_type need set to Debug" echo "such as: $0 -t rk3588 -a aarch64 -d mobilenet" - echo "Note: 'rk356x' represents rk3562/rk3566/rk3568, 'rv1106' represents rv1103/rv1106" + echo "Note: 'rk356x' represents rk3562/rk3566/rk3568, 'rv1106' represents rv1103/rv1106, 'rv1126' represents rv1109/rv1126" echo "" exit -1 fi @@ -50,6 +50,8 @@ if [[ -z ${GCC_COMPILER} ]];then echo "Please set GCC_COMPILER for $TARGET_SOC" echo "such as export GCC_COMPILER=~/opt/arm-rockchip830-linux-uclibcgnueabihf/bin/arm-rockchip830-linux-uclibcgnueabihf" exit + elif [[ ${TARGET_SOC} = "rv1109" || ${TARGET_SOC} = "rv1126" ]];then + GCC_COMPILER=arm-linux-gnueabihf else GCC_COMPILER=aarch64-linux-gnu fi @@ -99,7 +101,7 @@ then echo "$name" fi done - echo "rv1106_rv1103 only support: mobilenet and yolov5" + echo "rv1106_rv1103 only support: mobilenet and yolov5/6/7/8/x" exit fi @@ -122,9 +124,20 @@ case ${TARGET_SOC} in rk3562) TARGET_SOC="rk356x" ;; + rk3576) + TARGET_SOC="rk3576" + ;; + rk1808): + TARGET_SOC="rk1808" + ;; + rv1109) + ;; + rv1126) + TARGET_SOC="rv1126" + ;; *) echo "Invalid target: ${TARGET_SOC}" - echo "Valid target: rk3562,rk3566,rk3568,rk3588,rv1106,rv1103" + echo "Valid target: rk3562,rk3566,rk3568,rk3588,rk3576,rv1106,rv1103,rk1808,rv1109,rv1126" exit -1 ;; esac diff --git a/docs/Compilation_Environment_Setup_Guide.md b/docs/Compilation_Environment_Setup_Guide.md new file mode 100644 index 0000000..a119fe1 --- /dev/null 
+++ b/docs/Compilation_Environment_Setup_Guide.md @@ -0,0 +1,100 @@ +# Compilation Environment Setup Guide + +It is necessary to set up a cross-compilation environment before compiling the C/C++ Demo of the examples in this project on an x86 Linux system. + + +## Android Platform + +When the target device is an `Android` system, use the `build-android.sh` script in the root directory to compile the C/C++ Demo of the specific model. +Before using this script to compile a C/C++ Demo, please specify the path to the Android NDK through the environment variable `ANDROID_NDK_PATH`. + +### Download Android NDK + +*(If the Android NDK is already installed on the system, please ignore this step)* + +1. Download the NDK through this link (it is recommended to download the r19c version): https://dl.google.com/android/repository/android-ndk-r19c-linux-x86_64.zip +2. Decompress the downloaded Android NDK. Remember this path, which will be used later when compiling the C/C++ Demo. **Note: The directory name after decompressing the above NDK is `android-ndk-r19c`.** + +### Compile C/C++ Demo + +The command to compile a C/C++ Demo is as follows: +```shell +export ANDROID_NDK_PATH=<android_ndk_path> + +./build-android.sh -t <target> -a <arch> -d <model_name> +# for RK3588: +./build-android.sh -t rk3588 -a arm64-v8a -d mobilenet +# for RK3566: +./build-android.sh -t rk3566 -a arm64-v8a -d mobilenet +# for RK3568: +./build-android.sh -t rk3568 -a arm64-v8a -d mobilenet +``` +*Description:* +- `<android_ndk_path>`: Specify the Android NDK path, for example: `~/opt/android-ndk-r19c`. +- `<target>`: Specify the target platform, for example: `rk3566`, `rk3568`, `rk3588`. **Note: `RK1808`, `RV1109`, `RV1126`, `RV1103`, `RV1106` do not support the `Android` platform.** +- `<arch>`: Specify the system architecture. To query the system architecture, refer to the following command: + ```shell + # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should be shown in the log. + adb shell cat /proc/version + ``` +- `model_name`: The model name. It is the folder name of each model in the examples directory. + + +## Linux Platform + +When the target device is a `Linux` system, use the `build-linux.sh` script in the root directory to compile the C/C++ Demo of the specific model. +Before using this script to compile a C/C++ Demo, please specify the path of the cross-compilation tool through the environment variable `GCC_COMPILER`. + +### Download cross-compilation tools + +*(If the cross-compilation tool is already installed on your system, please ignore this step)* + +1. Different system architectures rely on different cross-compilation tools. The following are download links for the cross-compilation tools recommended for each system architecture: + - aarch64: https://releases.linaro.org/components/toolchain/binaries/6.3-2017.05/aarch64-linux-gnu/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu.tar.xz + - armhf: https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf.tar.xz?revision=e09a1c45-0ed3-4a8e-b06b-db3978fd8d56&rev=e09a1c450ed34a8eb06bdb3978fd8d56&hash=9C4F2E8255CB4D87EABF5769A2E65733 + - armhf-uclibcgnueabihf (RV1103/RV1106): https://console.zbox.filez.com/l/H1fV9a (fetch code: rknn) +2. Decompress the downloaded cross-compilation tool and remember the specific path, which will be used later during compilation; a quick way to verify the toolchain is sketched below.
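+
+As a quick sanity check (a minimal sketch, assuming the toolchain was decompressed to the example location used in the path examples later in this guide), you can confirm the cross-compiler runs before building anything:
+
+```shell
+# Assumed install location from step 2; substitute your own path.
+TOOLCHAIN=~/tools/cross_compiler/arm/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu
+
+# The toolchain's gcc should print its version banner if it is usable.
+${TOOLCHAIN}/bin/aarch64-linux-gnu-gcc --version
+```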
+ +### Compile C/C++ Demo + +The command reference for compiling a C/C++ Demo is as follows: +```shell +# go to the rknn_model_zoo root directory +cd <rknn_model_zoo_root_path> + +# if GCC_COMPILER not found while building, please set GCC_COMPILER path +export GCC_COMPILER=<GCC_COMPILER_PATH> + +./build-linux.sh -t <target> -a <arch> -d <model_name> + +# for RK3588 +./build-linux.sh -t rk3588 -a aarch64 -d mobilenet +# for RK3566 +./build-linux.sh -t rk3566 -a aarch64 -d mobilenet +# for RK3568 +./build-linux.sh -t rk3568 -a aarch64 -d mobilenet +# for RK1808 +./build-linux.sh -t rk1808 -a aarch64 -d mobilenet +# for RV1109 +./build-linux.sh -t rv1109 -a armhf -d mobilenet +# for RV1126 +./build-linux.sh -t rv1126 -a armhf -d mobilenet +# for RV1103 +./build-linux.sh -t rv1103 -a armhf -d mobilenet +# for RV1106 +./build-linux.sh -t rv1106 -a armhf -d mobilenet +``` + +*Description:* +- `<GCC_COMPILER_PATH>`: Specify the cross-compilation tool path. Different system architectures use different cross-compilation tools. + - `<GCC_COMPILER_PATH>` examples: + - aarch64: ~/tools/cross_compiler/arm/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu + - armhf: ~/tools/cross_compiler/arm/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/bin/arm-linux-gnueabihf + - armhf-uclibcgnueabihf (RV1103/RV1106): ~/tools/cross_compiler/arm/arm-rockchip830-linux-uclibcgnueabihf/bin/arm-rockchip830-linux-uclibcgnueabihf +- `<target>`: Specify the target platform. For example: `rk3588`. **Note: The target platforms currently supported by each model may differ; please refer to the `README.md` document in the specific model directory.** +- `<arch>`: Specify the system architecture. To query the system architecture, refer to the following command: + ```shell + # Query architecture. For Linux, ['aarch64' or 'armhf'] should be shown in the log. + adb shell cat /proc/version + ``` +- `model_name`: The model name. It is the folder name of each model in the examples directory. \ No newline at end of file diff --git a/docs/Compilation_Environment_Setup_Guide_CN.md b/docs/Compilation_Environment_Setup_Guide_CN.md new file mode 100644 index 0000000..e2daaa0 --- /dev/null +++ b/docs/Compilation_Environment_Setup_Guide_CN.md @@ -0,0 +1,100 @@ +# 交叉编译环境搭建指南 + +需要搭建好交叉编译环境,才可以在x86 Linux系统上编译本工程示例中的C/C++ Demo。 + + +## Android平台 + +目标设备是Android系统时,使用根目录下的`build-android.sh`脚本编译具体模型的C/C++ Demo。 +使用该脚本编译C/C++ Demo前需要先通过环境变量`ANDROID_NDK_PATH`指定Android NDK的路径。 + +### 下载Android NDK + +*(如果系统中已经装有Android NDK,请忽略此步骤)* + +1. 通过此链接下载NDK(建议下载r19c版本):https://dl.google.com/android/repository/android-ndk-r19c-linux-x86_64.zip +2. 解压缩下载好的Android NDK。记住该路径,后面编译C/C++ Demo时会用到该路径。**注:上述NDK解压后的目录名为`android-ndk-r19c`。** + +### 编译C/C++ Demo + +编译C/C++ Demo的命令如下: +```shell +export ANDROID_NDK_PATH=<android_ndk_path> + +./build-android.sh -t <target> -a <arch> -d <model_name> +# for RK3588: +./build-android.sh -t rk3588 -a arm64-v8a -d mobilenet +# for RK3566: +./build-android.sh -t rk3566 -a arm64-v8a -d mobilenet +# for RK3568: +./build-android.sh -t rk3568 -a arm64-v8a -d mobilenet +``` +*参数说明:* +- `<android_ndk_path>`: 指定Android NDK路径,例如:`~/opt/android-ndk-r19c`。 +- `<target>`: 指定目标平台,例如`rk3566`, `rk3568`, `rk3588`。**注:`RK1808`, `RV1109`, `RV1126`, `RV1103`, `RV1106`不支持`Android`平台。** +- `<arch>`: 指定系统架构。可以在目标设备执行如下命令查询系统架构: + ```shell + # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should be shown in the log. + adb shell cat /proc/version + ``` +- `model_name`: 模型名,即examples目录下各个模型所在的文件夹名。 + + +## Linux平台 + +目标设备是`Linux`系统时,使用根目录下的`build-linux.sh`脚本编译具体模型的 C/C++ Demo。 +使用该脚本编译C/C++ Demo前需要先通过环境变量`GCC_COMPILER`指定交叉编译工具的路径。 + +### 下载交叉编译工具 + +*(如果系统中已经装有交叉编译工具,请忽略此步骤)* +
+1. 不同的系统架构,依赖不同的交叉编译工具。下面给出具体系统架构建议使用的交叉编译工具下载链接: + - aarch64: https://releases.linaro.org/components/toolchain/binaries/6.3-2017.05/aarch64-linux-gnu/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu.tar.xz + - armhf: https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf.tar.xz?revision=e09a1c45-0ed3-4a8e-b06b-db3978fd8d56&rev=e09a1c450ed34a8eb06bdb3978fd8d56&hash=9C4F2E8255CB4D87EABF5769A2E65733 + - armhf-uclibcgnueabihf(RV1103/RV1106): https://console.zbox.filez.com/l/H1fV9a (fetch code: rknn) +2. 解压缩下载好的交叉编译工具,记住具体的路径,后面在编译时会用到该路径。 + +### 编译C/C++ Demo + +编译C/C++ Demo的命令参考如下: +```shell +# go to the rknn_model_zoo root directory +cd <rknn_model_zoo_root_path> + +# if GCC_COMPILER not found while building, please set GCC_COMPILER path +export GCC_COMPILER=<GCC_COMPILER_PATH> + +./build-linux.sh -t <target> -a <arch> -d <model_name> + +# for RK3588 +./build-linux.sh -t rk3588 -a aarch64 -d mobilenet +# for RK3566 +./build-linux.sh -t rk3566 -a aarch64 -d mobilenet +# for RK3568 +./build-linux.sh -t rk3568 -a aarch64 -d mobilenet +# for RK1808 +./build-linux.sh -t rk1808 -a aarch64 -d mobilenet +# for RV1109 +./build-linux.sh -t rv1109 -a armhf -d mobilenet +# for RV1126 +./build-linux.sh -t rv1126 -a armhf -d mobilenet +# for RV1103 +./build-linux.sh -t rv1103 -a armhf -d mobilenet +# for RV1106 +./build-linux.sh -t rv1106 -a armhf -d mobilenet +``` + +*参数说明:* +- `<GCC_COMPILER_PATH>`: 指定交叉编译工具路径。不同的系统架构,所用的交叉编译工具并不相同。 + - `<GCC_COMPILER_PATH>` 示例: + - aarch64: ~/tools/cross_compiler/arm/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu + - armhf: ~/tools/cross_compiler/arm/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/bin/arm-linux-gnueabihf + - armhf-uclibcgnueabihf(RV1103/RV1106): ~/tools/cross_compiler/arm/arm-rockchip830-linux-uclibcgnueabihf/bin/arm-rockchip830-linux-uclibcgnueabihf +- `<target>`: 指定目标平台。例如:`rk3588`。**注:每个模型当前支持的目标平台可能有所不同,请参考具体模型目录下的`README.md`文档。** +- `<arch>`: 指定系统架构。可以在目标设备执行如下命令查询系统架构: + ```shell + # Query architecture. For Linux, ['aarch64' or 'armhf'] should be shown in the log. + adb shell cat /proc/version + ``` +- `model_name`: 模型名,即examples目录下各个模型所在的文件夹名。 diff --git a/examples/LPRNet/README.md index 7c85131..302a3b2 100644 --- a/examples/LPRNet/README.md +++ b/examples/LPRNet/README.md @@ -2,19 +2,20 @@ ## Table of contents +- [Table of contents](#table-of-contents) - [1. Description](#1-description) - [2. Current Support Platform](#2-current-support-platform) - [3. Pretrained Model](#3-pretrained-model) - [4. Convert to RKNN](#4-convert-to-rknn) - [5. Python Demo](#5-python-demo) - [6. Android Demo](#6-android-demo) - - [6.1 Compile and Build](#61-compile-and-build) - - [6.2 Push demo files to device](#62-push-demo-files-to-device) - - [6.3 Run demo](#63-run-demo) + - [6.1 Compile and Build](#61-compile-and-build) + - [6.2 Push demo files to device](#62-push-demo-files-to-device) + - [6.3 Run demo](#63-run-demo) - [7. Linux Demo](#7-linux-demo) - - [7.1 Compile \&\& Build](#71-compile-and-build) - - [7.2 Push demo files to device](#72-push-demo-files-to-device) - - [7.3 Run demo](#73-run-demo) + - [7.1 Compile and Build](#71-compile-and-build) + - [7.2 Push demo files to device](#72-push-demo-files-to-device) + - [7.3 Run demo](#73-run-demo) - [8. Expected Results](#8-expected-results) @@ -29,7 +30,7 @@ https://github.com/sirius-ai/LPRNet_Pytorch/ ## 2. Current Support Platform
-RK3566, RK3588, RK3568, RK3562 +RK3566, RK3588, RK3568, RK3562, RK1808, RV1109, RV1126 @@ -54,10 +55,9 @@ cd model ```shell cd python -python lprnet.py <onnx_model> <TARGET_PLATFORM> <dtype(optional)> <output_rknn_path(optional)> +python convert.py <onnx_model> <TARGET_PLATFORM> <dtype(optional)> <output_rknn_path(optional)> -# such as: -python lprnet.py ../model/lprnet.onnx rk3588 +# such as: python convert.py ../model/lprnet.onnx rk3588 # output model will be saved as ../model/lprnet.rknn ``` @@ -65,48 +65,35 @@ python lprnet.py ../model/lprnet.onnx rk3588 - `<onnx_model>`: Specify the ONNX model path. - `<TARGET_PLATFORM>`: Specify the NPU platform name. For supported platforms, refer [here](#2-current-support-platform). -- `<dtype>(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `i8`. +- `<dtype>(optional)`: Specify as `i8`, `u8` or `fp`; `i8`/`u8` means to do quantization (`i8` for RKNPU2 platforms, `u8` for RKNPU1 platforms), `fp` means no quantization. The default is `i8`/`u8`. - `<output_rknn_path>(optional)`: Specify the save path for the RKNN model; by default it is saved in the same directory as the ONNX model with the name `lprnet.rknn` ## 5. Python Demo +*Usage:* +```shell +cd python +# Inference with RKNN model +python lprnet.py --model_path <MODEL_PATH> --target <TARGET_PLATFORM> +``` +*Description:* +- <TARGET_PLATFORM>: Specify the NPU platform name, such as 'rk3588'. +- <MODEL_PATH>: Specify the RKNN model path. -Please refer [Convert to RKNN](#4. Convert to RKNN). Executing the `lprnet.py` will identify the model/test.jpg license plate. The expected results are as follows: - +*The expected results are as follows:* ``` 车牌识别结果: 湘F6CL03 ``` - - ## 6. Android Demo +**Note: RK1808, RV1109 and RV1126 do not support Android.** #### 6.1 Compile and Build -*Usage:* - -```sh -# go back to the rknn_model_zoo root directory -cd ../../ -export ANDROID_NDK_PATH= - -./build-android.sh -t -a -d LPRNet - -# such as -./build-android.sh -t rk3588 -a arm64-v8a -d LPRNet -``` - -*Description:* - -- ``: Specify Android NDK path. - -- ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). - -- ``: Specify device system architecture. To query device architecture, refer to the following command: - ```shell - # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to set up a cross-compilation environment and complete the compilation of the C/C++ Demo. #### 6.2 Push demo files to device @@ -133,31 +120,7 @@ export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b export LD_LIBRARY_PATH=./lib #### 7.1 Compile and Build -*usage* - -```shell -# go back to the rknn_model_zoo root directory -cd ../../ - -# if GCC_COMPILER not found while building, please set GCC_COMPILER path -(optional)export GCC_COMPILER= - -./build-linux.sh -t -a -d LPRNet - -# such as -./build-linux.sh -t rk3588 -a aarch64 -d LPRNet -``` - -*Description:* - -- ``: Specified as GCC_COMPILER path. -- `` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - - ```shell - # Query architecture. For Linux, ['aarch64' or 'armhf'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to set up a cross-compilation environment and complete the compilation of the C/C++ Demo; a reference invocation for an RKNPU1 board is sketched below.
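+
+For reference, here is a minimal sketch of such an invocation for one of the newly supported RKNPU1 boards (the `GCC_COMPILER` prefix below is an assumed install location; substitute the toolchain path from the guide):
+
+```shell
+# Example only: cross-compile the LPRNet demo for RV1126 (32-bit Linux, armhf).
+export GCC_COMPILER=~/tools/cross_compiler/arm/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/bin/arm-linux-gnueabihf
+
+# rv1126 builds also cover rv1109; the demo name matches the examples folder.
+./build-linux.sh -t rv1126 -a armhf -d LPRNet
+```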
#### 7.2 Push demo files to device diff --git a/examples/LPRNet/cpp/CMakeLists.txt b/examples/LPRNet/cpp/CMakeLists.txt index 6f75e99..a0bc231 100644 --- a/examples/LPRNet/cpp/CMakeLists.txt +++ b/examples/LPRNet/cpp/CMakeLists.txt @@ -41,11 +41,18 @@ message(STATUS OpenCV_LIBS=${OpenCV_LIBS}) set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(lprnet_file rknpu1/lprnet.cc) +else() + set(lprnet_file rknpu2/lprnet.cc) +endif() + + file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc - rknpu2/lprnet.cc + ${lprnet_file} ) target_link_libraries(${PROJECT_NAME} diff --git a/examples/LPRNet/cpp/rknpu1/lprnet.cc b/examples/LPRNet/cpp/rknpu1/lprnet.cc new file mode 100644 index 0000000..a5d690b --- /dev/null +++ b/examples/LPRNet/cpp/rknpu1/lprnet.cc @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include "lprnet.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" +#include "opencv2/opencv.hpp" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_lprnet_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_lprnet_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, lprnet_result *out_result) +{ + int ret; + rknn_input inputs[1]; + rknn_output outputs[1]; + + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + cv::Mat img_ori = cv::Mat(src_img->height, src_img->width, CV_8UC3, (uint8_t *)src_img->virt_addr); + cv::Mat img_pre; + cv::resize(img_ori, img_pre, cv::Size(94, 24)); + cv::cvtColor(img_pre, img_pre, cv::COLOR_RGB2BGR); + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = img_pre.data; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + return ret; + } + + // Post Process + std::vector no_repeat_blank_label{}; + float prebs[18]; + int pre_c; + for (int x = 0; x < 18; x++) // Traverse 18 license plate positions + { + float *ptr = (float *)outputs[0].buf; + float preb[68]; + for (int y = 0; y < 68; y++) // Traverse 68 string positions + { + preb[y] = ptr[x]; + ptr += 18; + } + int max_num_index = std::max_element(preb, preb + 68) - preb; + prebs[x] = max_num_index; + } + + // Remove duplicates and blanks + pre_c = prebs[0]; + if (pre_c != 67) + { + no_repeat_blank_label.push_back(pre_c); + } + for (int value : prebs) + { + if (value == 67 or value == pre_c) + { + if (value == 67 or value == pre_c) + { + pre_c = value; + } + continue; + } + no_repeat_blank_label.push_back(value); + pre_c = value; + } + + // The license plate is converted into a string according to the dictionary + out_result->plate_name.clear(); + for (int hh : no_repeat_blank_label) + { + out_result->plate_name += plate_code[hh]; + } + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + + return ret; +} \ No newline at end of file diff --git a/examples/LPRNet/cpp/rknpu2/lprnet.cc b/examples/LPRNet/cpp/rknpu2/lprnet.cc index 0c18d36..450c665 100644 --- a/examples/LPRNet/cpp/rknpu2/lprnet.cc +++ b/examples/LPRNet/cpp/rknpu2/lprnet.cc @@ -112,11 +112,6 @@ int init_lprnet_model(const char *model_path, rknn_app_context_t *app_ctx) int release_lprnet_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -127,6 +122,11 @@ int release_lprnet_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } @@ -181,11 +181,11 @@ int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, std::vector no_repeat_blank_label{}; float prebs[18]; int pre_c; - for (int x = 0; x < 18; x++) //遍历十八个车牌位置 + for (int x = 0; x < 18; x++) // Traverse 18 license plate positions { float *ptr = (float *)outputs[0].buf; float preb[68]; - for (int y = 0; y < 68; y++) //遍历68个字符串位置 + for (int y = 0; y < 68; y++) // Traverse 68 string positions { preb[y] = ptr[x]; ptr += 18; @@ -194,7 +194,7 @@ int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, prebs[x] = max_num_index; } - //去重复、去空白 + // Remove duplicates and blanks pre_c = prebs[0]; if (pre_c != 67) { @@ -214,7 +214,7 @@ int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, pre_c = value; } - // 车牌按照字典转化为字符串并输出 + // The license plate is converted into a string according to the dictionary out_result->plate_name.clear(); for (int hh : no_repeat_blank_label) { diff --git a/examples/LPRNet/model/dataset.txt b/examples/LPRNet/model/dataset.txt new file mode 100644 index 0000000..0e8c5b6 --- /dev/null +++ b/examples/LPRNet/model/dataset.txt @@ -0,0 +1 @@ +test.jpg \ No newline at end of file diff --git a/examples/LPRNet/python/convert.py b/examples/LPRNet/python/convert.py new file mode 100644 index 0000000..162b9b4 --- /dev/null +++ b/examples/LPRNet/python/convert.py @@ -0,0 +1,73 @@ +import sys +from rknn.api import RKNN + +DATASET_PATH = '../model/dataset.txt' +DEFAULT_RKNN_PATH = '../model/lprnet.rknn' +DEFAULT_QUANT = True + +def parse_arg(): + if len(sys.argv) < 3: + print("Usage: python3 {} onnx_model_path [platform] 
[dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); + print(" platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") + exit(1) + + model_path = sys.argv[1] + platform = sys.argv[2] + + do_quant = DEFAULT_QUANT + if len(sys.argv) > 3: + model_type = sys.argv[3] + if model_type not in ['i8', 'u8', 'fp']: + print("ERROR: Invalid model type: {}".format(model_type)) + exit(1) + elif model_type in ['i8', 'u8']: + do_quant = True + else: + do_quant = False + + if len(sys.argv) > 4: + output_path = sys.argv[4] + else: + output_path = DEFAULT_RKNN_PATH + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() + + # Create RKNN object + rknn = RKNN(verbose=False) + + # Pre-process config + print('--> Config model') + rknn.config(mean_values=[[127.5, 127.5, 127.5]], std_values=[[127.5, 127.5, 127.5]], target_platform=platform) + print('done') + + # Load model + print('--> Loading model') + ret = rknn.load_onnx(model=model_path) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') + + # Build model + print('--> Building model') + ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') + + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model failed!') + exit(ret) + print('done') + + # Release + rknn.release() diff --git a/examples/LPRNet/python/export_onnx.py b/examples/LPRNet/python/export_onnx.py index 1bc4794..73b14d3 100644 --- a/examples/LPRNet/python/export_onnx.py +++ b/examples/LPRNet/python/export_onnx.py @@ -12,7 +12,7 @@ MODEL_PATH = MODEL_DIR + 'Final_LPRNet_model.pth' -# 将maxpool3d转换成maxpool2d的类 +# Convert maxpool3d to the class of maxpool2d class maxpool_3d(nn.Module): def __init__(self, kernel_size, stride): super(maxpool_3d, self).__init__() diff --git a/examples/LPRNet/python/lprnet.py b/examples/LPRNet/python/lprnet.py index 1c77ed9..f48c65d 100644 --- a/examples/LPRNet/python/lprnet.py +++ b/examples/LPRNet/python/lprnet.py @@ -2,12 +2,9 @@ import sys import cv2 import numpy as np +import argparse from rknn.api import RKNN -DATASET_PATH = '../../../datasets/LPRNET/datasets.txt' -DEFAULT_RKNN_PATH = '../model/lprnet.rknn' -DEFAULT_QUANT = True - CHARS = ['京', '沪', '津', '渝', '冀', '晋', '蒙', '辽', '吉', '黑', '苏', '浙', '皖', '闽', '赣', '鲁', '豫', '鄂', '湘', '粤', '桂', '琼', '川', '贵', '云', '藏', '陕', '甘', '青', '宁', @@ -45,83 +42,40 @@ def decode(preds, CHARS): labels.append(lb) return labels, pred_labels - -def parse_arg(): - if len(sys.argv) < 3: - print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") - exit(1) - - model_path = sys.argv[1] - platform = sys.argv[2] - - do_quant = DEFAULT_QUANT - if len(sys.argv) > 3: - model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: - print("ERROR: Invalid model type: {}".format(model_type)) - exit(1) - elif model_type == 'i8': - do_quant = True - else: - do_quant = False - - if len(sys.argv) > 4: - output_path = sys.argv[4] - else: - output_path = DEFAULT_RKNN_PATH - - return model_path, platform, do_quant, output_path - if __name__ == 
'__main__': - model_path, platform, do_quant, output_path = parse_arg() + parser = argparse.ArgumentParser(description='LPRNet Python Demo', add_help=True) + # basic params + parser.add_argument('--model_path', type=str, required=True, + help='model path, could be .rknn file') + parser.add_argument('--target', type=str, + default='rk3566', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, + default=None, help='device id') + args = parser.parse_args() # Create RKNN object - rknn = RKNN() - - # Pre-process config - print('--> Config model') - rknn.config(mean_values=[127.5, 127.5, 127.5], std_values=[127.5, 127.5, 127.5], target_platform=platform) - print('done') + rknn = RKNN(verbose=True) - # Load model - print('--> Loading model') - ret = rknn.load_onnx(model=model_path, - inputs=['input'], - input_size_list=[[1, 3, 24, 94]]) + # Load RKNN model + ret = rknn.load_rknn(args.model_path) if ret != 0: - print('Load model failed!') + print('Load RKNN model \"{}\" failed!'.format(args.model_path)) exit(ret) print('done') - # Build model - print('--> Building model') - ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH) - if ret != 0: - print('Build model failed!') - exit(ret) - print('done') + print(args.target) - # Export rknn model - print('--> Export rknn model') - ret = rknn.export_rknn(output_path) + # Init runtime environment + print('--> Init runtime environment') + ret = rknn.init_runtime(target=args.target, device_id=args.device_id) if ret != 0: - print('Export rknn model failed!') + print('Init runtime environment failed!') exit(ret) print('done') # Set inputs img = cv2.imread('../model/test.jpg') img = cv2.resize(img, (94, 24)) - - # Init runtime environment - print('--> Init runtime environment') - ret = rknn.init_runtime() - if ret != 0: - print('Init runtime environment failed!') - exit(ret) - print('done') # Inference print('--> Running model') diff --git a/examples/PPOCR/PPOCR-Det/README.md b/examples/PPOCR/PPOCR-Det/README.md index 33378fc..934f91d 100644 --- a/examples/PPOCR/PPOCR-Det/README.md +++ b/examples/PPOCR/PPOCR-Det/README.md @@ -1,5 +1,9 @@ # PPOCR-Det +## Current Support Platform + +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 + ## Download ONNX model @@ -37,8 +41,8 @@ python convert.py should be the ONNX model path. -- could be specified as RK3562, RK3566, RK3568, RK3588 according to board SOC version. -- is *optional*, could be specified as `i8` or `fp`, `i8` means to do quantization, `fp` means no to do quantization, default is `i8`. +- could be specified as RK3562, RK3566, RK3568, RK3588, RK1808, RV1109, RV1126 according to board SOC version. +- is *optional*, could be specified as `i8`, `u8` or `fp`, `i8`/`u8` means to do quantization, `fp` means no to do quantization, default is `i8`/`u8`. - is **optional**, used to specify the saving path of the RKNN model, default save path is `../model/ppocrv4_det.rknn` *Attention:* @@ -48,32 +52,29 @@ python convert.py +# such as: python ppocr_det.py --model_path ../model/ppocrv4_det.onnx -```bash -python python/ppocr_det.py \ - --image_dir model/test.jpg \ - --det_model_dir model/ppocrv4_det.onnx \ - --use_gpu false --use_onnx true +# Inference with RKNN model +python ppocr_det.py --model_path --target +# such as: python ppocr_det.py --model_path ../model/ppocrv4_det.rknn --target rk3588 ``` +*Description:* +- : Specify NPU platform name. Such as 'rk3588'. + +- : specified as the model path. 
-For RKNN: -```bash -python python/ppocr_det.py \ - --image_dir model/test.jpg \ - --det_model_dir model/ppocrv4_det.rknn \ - --use_gpu false --use_rknn true --platform rk3568 --det_image_shape 480 480 -``` ## Android Demo +**Note: RK1808, RV1109, RV1126 does not support Android.** ### Compiling && Building diff --git a/examples/PPOCR/PPOCR-Det/cpp/CMakeLists.txt b/examples/PPOCR/PPOCR-Det/cpp/CMakeLists.txt index a5e5f10..3dfdfca 100644 --- a/examples/PPOCR/PPOCR-Det/cpp/CMakeLists.txt +++ b/examples/PPOCR/PPOCR-Det/cpp/CMakeLists.txt @@ -41,13 +41,20 @@ message(STATUS OpenCV_LIBS=${OpenCV_LIBS}) set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(ppocr_det_file rknpu1/ppocr_det.cc) +else() + set(ppocr_det_file rknpu2/ppocr_det.cc) +endif() + + file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc postprocess.cc clipper.cc - rknpu2/ppocr_det.cc + ${ppocr_det_file} ) target_link_libraries(${PROJECT_NAME} diff --git a/examples/PPOCR/PPOCR-Det/cpp/rknpu1/ppocr_det.cc b/examples/PPOCR/PPOCR-Det/cpp/rknpu1/ppocr_det.cc new file mode 100644 index 0000000..0d071fc --- /dev/null +++ b/examples/PPOCR/PPOCR-Det/cpp/rknpu1/ppocr_det.cc @@ -0,0 +1,195 @@ +#include +#include +#include +#include + +#include "ppocr_det.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr* attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_ppocr_det_model(const char* model_path, rknn_app_context_t* app_ctx) +{ + int ret; + int model_len = 0; + char* model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr*)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr*)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_ppocr_det_model(rknn_app_context_t* app_ctx) +{ + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_ppocr_det_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, ppocr_det_postprocess_params* params, ppocr_det_result* out_result) +{ + int ret; + image_buffer_t img; + rknn_input inputs[1]; + rknn_output outputs[1]; + + memset(&img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + img.width = app_ctx->model_width; + img.height = app_ctx->model_height; + img.format = IMAGE_FORMAT_RGB888; + img.size = get_image_size(&img); + img.virt_addr = (unsigned char*)malloc(img.size); + if (img.virt_addr == NULL) { + printf("malloc buffer size:%d fail!\n", img.size); + return -1; + } + + ret = convert_image(src_img, &img, NULL, NULL, 0); + if (ret < 0) { + printf("convert_image fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = img.virt_addr; + + float scale_w = (float)src_img->width / (float)img.width; + float scale_h = (float)src_img->height / (float)img.height; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + // Post Process + ret = dbnet_postprocess((float*)outputs[0].buf, app_ctx->model_width, app_ctx->model_height, + params->threshold, params->box_threshold, params->use_dilate, params->db_score_mode, + params->db_unclip_ratio, params->db_box_type, + scale_w, scale_h, out_result); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + +out: + if (img.virt_addr != NULL) { + free(img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Det/cpp/rknpu2/ppocr_det.cc b/examples/PPOCR/PPOCR-Det/cpp/rknpu2/ppocr_det.cc index 4dcee75..ba4e026 100644 --- a/examples/PPOCR/PPOCR-Det/cpp/rknpu2/ppocr_det.cc +++ b/examples/PPOCR/PPOCR-Det/cpp/rknpu2/ppocr_det.cc @@ -102,10 +102,6 @@ int init_ppocr_det_model(const char* model_path, rknn_app_context_t* app_ctx) int release_ppocr_det_model(rknn_app_context_t* app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -114,6 +110,10 @@ int release_ppocr_det_model(rknn_app_context_t* app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/PPOCR/PPOCR-Det/python/convert.py b/examples/PPOCR/PPOCR-Det/python/convert.py index a631ece..82a0d01 100644 --- a/examples/PPOCR/PPOCR-Det/python/convert.py +++ b/examples/PPOCR/PPOCR-Det/python/convert.py @@ -11,12 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os import sys -from tabnanny import verbose from rknn.api import RKNN - DATASET_PATH = '../../../../datasets/PPOCR/imgs/dataset_20.txt' DEFAULT_RKNN_PATH = '../model/ppocrv4_det.rknn' DEFAULT_QUANT = True @@ -24,8 +21,9 @@ def parse_arg(): if len(sys.argv) < 3: print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print(" platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -34,10 +32,10 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False @@ -52,33 +50,37 @@ def parse_arg(): if __name__ == '__main__': model_path, platform, do_quant, output_path = parse_arg() - model = RKNN(verbose=False) + # Create RKNN object + rknn = RKNN(verbose=False) - # Config - model.config( - mean_values=[123.675, 116.28, 103.53], - std_values=[58.395, 57.12, 57.375], - target_platform=platform, - ) + # Pre-process config + print('--> Config model') + rknn.config(mean_values=[[123.675, 116.28, 103.53]], std_values=[[58.395, 57.12, 57.375]], target_platform=platform) + print('done') - # Load ONNX model - ret = model.load_onnx(model=model_path) + # Load model + print('--> Loading model') + ret = rknn.load_onnx(model=model_path) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') # Build model - ret = model.build( - do_quantization=do_quant, - dataset=DATASET_PATH) - assert ret == 0, "Build model failed!" - - # Init Runtime - # ret = model.init_runtime() - # assert ret == 0, "Init runtime environment failed!" + print('--> Building model') + ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') - # Export - if not os.path.exists(os.path.dirname(output_path)): - os.mkdir(os.path.dirname(output_path)) + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model failed!') + exit(ret) + print('done') - ret = model.export_rknn( - output_path) - assert ret == 0, "Export rknn model failed!" - print("Export OK!") + # Release + rknn.release() \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Det/python/ppocr_det.py b/examples/PPOCR/PPOCR-Det/python/ppocr_det.py index f4d66b7..529ded7 100644 --- a/examples/PPOCR/PPOCR-Det/python/ppocr_det.py +++ b/examples/PPOCR/PPOCR-Det/python/ppocr_det.py @@ -13,367 +13,144 @@ # limitations under the License. 
import os import sys - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' - import cv2 import numpy as np -import time -import sys +import argparse +import utils.operators +from utils.db_postprocess import DBPostProcess, DetPostProcess -import utility -from paddleocr.ppocr.utils.logging import get_logger -from paddleocr.ppocr.utils.utility import get_image_file_list, check_and_read -from paddleocr.ppocr.data import create_operators, transform -from paddleocr.ppocr.postprocess import build_post_process -import json -logger = get_logger() +# add path +realpath = os.path.abspath(__file__) +_sep = os.path.sep +realpath = realpath.split(_sep) +sys.path.append(os.path.join(realpath[0]+_sep, *realpath[1:realpath.index('rknn_model_zoo')+1])) -class TextDetector(object): - def __init__(self, args): - self.args = args - self.det_algorithm = args.det_algorithm - self.use_onnx = args.use_onnx - self.use_rknn = args.use_rknn - pre_process_list = [{ - 'DetResizeForTest': { - 'limit_side_len': args.det_limit_side_len, - 'limit_type': args.det_limit_type, +DET_INPUT_SHAPE = [480, 480] # h,w + +ONNX_PRE_PROCESS_CONFIG = [ + { + 'DetResizeForTest': + { + 'limit_side_len': 480, + 'limit_type': 'max', } - }, { + }, + { 'NormalizeImage': { 'std': [0.229, 0.224, 0.225], 'mean': [0.485, 0.456, 0.406], 'scale': '1./255.', 'order': 'hwc' } - }, { - 'ToCHWImage': None - }, { - 'KeepKeys': { - 'keep_keys': ['image', 'shape'] - } - }] - postprocess_params = {} - if self.det_algorithm == "DB": - postprocess_params['name'] = 'DBPostProcess' - postprocess_params["thresh"] = args.det_db_thresh - postprocess_params["box_thresh"] = args.det_db_box_thresh - postprocess_params["max_candidates"] = 1000 - postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio - postprocess_params["use_dilation"] = args.use_dilation - postprocess_params["score_mode"] = args.det_db_score_mode - postprocess_params["box_type"] = args.det_box_type - elif self.det_algorithm == "DB++": - postprocess_params['name'] = 'DBPostProcess' - postprocess_params["thresh"] = args.det_db_thresh - postprocess_params["box_thresh"] = args.det_db_box_thresh - postprocess_params["max_candidates"] = 1000 - postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio - postprocess_params["use_dilation"] = args.use_dilation - postprocess_params["score_mode"] = args.det_db_score_mode - postprocess_params["box_type"] = args.det_box_type - pre_process_list[1] = { - 'NormalizeImage': { - 'std': [1.0, 1.0, 1.0], - 'mean': - [0.48109378172549, 0.45752457890196, 0.40787054090196], - 'scale': '1./255.', - 'order': 'hwc' - } - } - elif self.det_algorithm == "EAST": - postprocess_params['name'] = 'EASTPostProcess' - postprocess_params["score_thresh"] = args.det_east_score_thresh - postprocess_params["cover_thresh"] = args.det_east_cover_thresh - postprocess_params["nms_thresh"] = args.det_east_nms_thresh - elif self.det_algorithm == "SAST": - pre_process_list[0] = { - 'DetResizeForTest': { - 'resize_long': args.det_limit_side_len - } - } - postprocess_params['name'] = 'SASTPostProcess' - postprocess_params["score_thresh"] = args.det_sast_score_thresh - postprocess_params["nms_thresh"] = args.det_sast_nms_thresh - - if args.det_box_type == 'poly': - postprocess_params["sample_pts_num"] = 6 - postprocess_params["expand_scale"] = 1.2 - postprocess_params["shrink_ratio_of_width"] = 0.2 - else: - 
postprocess_params["sample_pts_num"] = 2 - postprocess_params["expand_scale"] = 1.0 - postprocess_params["shrink_ratio_of_width"] = 0.3 - - elif self.det_algorithm == "PSE": - postprocess_params['name'] = 'PSEPostProcess' - postprocess_params["thresh"] = args.det_pse_thresh - postprocess_params["box_thresh"] = args.det_pse_box_thresh - postprocess_params["min_area"] = args.det_pse_min_area - postprocess_params["box_type"] = args.det_box_type - postprocess_params["scale"] = args.det_pse_scale - elif self.det_algorithm == "FCE": - pre_process_list[0] = { - 'DetResizeForTest': { - 'rescale_img': [1080, 736] - } - } - postprocess_params['name'] = 'FCEPostProcess' - postprocess_params["scales"] = args.scales - postprocess_params["alpha"] = args.alpha - postprocess_params["beta"] = args.beta - postprocess_params["fourier_degree"] = args.fourier_degree - postprocess_params["box_type"] = args.det_box_type - elif self.det_algorithm == "CT": - pre_process_list[0] = {'ScaleAlignedShort': {'short_size': 640}} - postprocess_params['name'] = 'CTPostProcess' - else: - logger.info("unknown det_algorithm:{}".format(self.det_algorithm)) - sys.exit(0) - - self.preprocess_op = create_operators(pre_process_list) - self.postprocess_op = build_post_process(postprocess_params) - self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor( - args, 'det', logger) + }, + ] - if self.use_onnx: - img_h, img_w = self.input_tensor.shape[2:] - if isinstance(img_h, str) or isinstance(img_w, str): - pass - elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0: - pre_process_list[0] = { - 'DetResizeForTest': { - 'image_shape': [img_h, img_w] - } - } - - if self.use_rknn: - pre_process_list[0] = { - 'DetResizeForTest': { - 'image_shape': args.det_image_shape +RKNN_PRE_PROCESS_CONFIG = [ + { + 'DetResizeForTest': { + 'image_shape': DET_INPUT_SHAPE } - } - pre_process_list[1] = { - 'NormalizeImage': { + }, + { + 'NormalizeImage': + { 'std': [1., 1., 1.], 'mean': [0., 0., 0.], 'scale': '1.', 'order': 'hwc' - } } - self.preprocess_op = create_operators(pre_process_list) - - if args.benchmark: - import auto_log - pid = os.getpid() - gpu_id = utility.get_infer_gpuid() - self.autolog = auto_log.AutoLogger( - model_name="det", - model_precision=args.precision, - batch_size=1, - data_shape="dynamic", - save_path=None, - inference_config=self.config, - pids=pid, - process_name=None, - gpu_ids=gpu_id if args.use_gpu else None, - time_keys=[ - 'preprocess_time', 'inference_time', 'postprocess_time' - ], - warmup=2, - logger=logger) - - def order_points_clockwise(self, pts): - rect = np.zeros((4, 2), dtype="float32") - s = pts.sum(axis=1) - rect[0] = pts[np.argmin(s)] - rect[2] = pts[np.argmax(s)] - tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0) - diff = np.diff(np.array(tmp), axis=1) - rect[1] = tmp[np.argmin(diff)] - rect[3] = tmp[np.argmax(diff)] - return rect - - def clip_det_res(self, points, img_height, img_width): - for pno in range(points.shape[0]): - points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) - points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) - return points - - def filter_tag_det_res(self, dt_boxes, image_shape): - img_height, img_width = image_shape[0:2] - dt_boxes_new = [] - for box in dt_boxes: - if type(box) is list: - box = np.array(box) - box = self.order_points_clockwise(box) - box = self.clip_det_res(box, img_height, img_width) - rect_width = int(np.linalg.norm(box[0] - box[1])) - rect_height = 
int(np.linalg.norm(box[0] - box[3])) - if rect_width <= 3 or rect_height <= 3: - continue - dt_boxes_new.append(box) - dt_boxes = np.array(dt_boxes_new) - return dt_boxes - - def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): - img_height, img_width = image_shape[0:2] - dt_boxes_new = [] - for box in dt_boxes: - if type(box) is list: - box = np.array(box) - box = self.clip_det_res(box, img_height, img_width) - dt_boxes_new.append(box) - dt_boxes = np.array(dt_boxes_new) - return dt_boxes - - def release_rknn(self): - self.predictor.release() - - def __call__(self, img): - ori_im = img.copy() - data = {'image': img} - - st = time.time() - - if self.args.benchmark: - self.autolog.times.start() - - data = transform(data, self.preprocess_op) - img, shape_list = data - if img is None: - return None, 0 - img = np.expand_dims(img, axis=0) - shape_list = np.expand_dims(shape_list, axis=0) - img = img.copy() - - if self.args.benchmark: - self.autolog.times.stamp() - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = img - outputs = self.predictor.run(self.output_tensors, input_dict) - elif self.use_rknn: - img = img[0].transpose(1, 2, 0) - outputs = self.predictor.inference(inputs=[img]) - else: - self.input_tensor.copy_from_cpu(img) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.args.benchmark: - self.autolog.times.stamp() - - preds = {} - if self.det_algorithm == "EAST": - preds['f_geo'] = outputs[0] - preds['f_score'] = outputs[1] - elif self.det_algorithm == 'SAST': - preds['f_border'] = outputs[0] - preds['f_score'] = outputs[1] - preds['f_tco'] = outputs[2] - preds['f_tvo'] = outputs[3] - elif self.det_algorithm in ['DB', 'PSE', 'DB++']: - preds['maps'] = outputs[0] - elif self.det_algorithm == 'FCE': - for i, output in enumerate(outputs): - preds['level_{}'.format(i)] = output - elif self.det_algorithm == "CT": - preds['maps'] = outputs[0] - preds['score'] = outputs[1] - else: - raise NotImplementedError - - post_result = self.postprocess_op(preds, shape_list) - dt_boxes = post_result[0]['points'] - - if self.args.det_box_type == 'poly': - dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) - else: - dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) - - if self.args.benchmark: - self.autolog.times.end(stamp=True) - et = time.time() - return dt_boxes, et - st - - -if __name__ == "__main__": - args = utility.parse_args() - image_file_list = get_image_file_list(args.image_dir) - text_detector = TextDetector(args) - total_time = 0 - draw_img_save_dir = args.draw_img_save_dir - os.makedirs(draw_img_save_dir, exist_ok=True) - - if args.warmup: - img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) - for i in range(2): - res = text_detector(img) - - save_results = [] - for idx, image_file in enumerate(image_file_list): - img, flag_gif, flag_pdf = check_and_read(image_file) - if not flag_gif and not flag_pdf: - img = cv2.imread(image_file) - if not flag_pdf: - if img is None: - logger.debug("error in loading image:{}".format(image_file)) - continue - imgs = [img] - else: - page_num = args.page_num - if page_num > len(img) or page_num == 0: - page_num = len(img) - imgs = img[:page_num] - for index, img in enumerate(imgs): - st = time.time() - dt_boxes, _ = text_detector(img) - elapse = time.time() - st - total_time += elapse - if len(imgs) > 1: - save_pred = os.path.basename(image_file) + '_' + str( - index) + "\t" + str( 
- json.dumps([x.tolist() for x in dt_boxes])) + "\n" - else: - save_pred = os.path.basename(image_file) + "\t" + str( - json.dumps([x.tolist() for x in dt_boxes])) + "\n" - save_results.append(save_pred) - logger.info(save_pred) - if len(imgs) > 1: - logger.info("{}_{} The predict time of {}: {}".format( - idx, index, image_file, elapse)) - else: - logger.info("{} The predict time of {}: {}".format( - idx, image_file, elapse)) - - src_im = utility.draw_text_det_res(dt_boxes, img) - - if flag_gif: - save_file = image_file[:-3] + "png" - elif flag_pdf: - save_file = image_file.replace('.pdf', - '_' + str(index) + '.png') - else: - save_file = image_file - img_path = os.path.join( - draw_img_save_dir, - "det_res_{}".format(os.path.basename(save_file))) - cv2.imwrite(img_path, src_im) - logger.info("The visualized image saved in {}".format(img_path)) - - with open(os.path.join(draw_img_save_dir, "det_results.txt"), 'w') as f: - f.writelines(save_results) - f.close() - if args.benchmark: - text_detector.autolog.report() - if args.use_rknn: - text_detector.release_rknn() + } + ] + +POSTPROCESS_CONFIG = { + 'DBPostProcess':{ + 'thresh': 0.3, + 'box_thresh': 0.6, + 'max_candidates': 1000, + 'unclip_ratio': 1.5, + 'use_dilation': False, + 'score_mode': 'fast', + } +} + +class TextDetector: + def __init__(self, args) -> None: + self.model, self.framework = setup_model(args) + self.preprocess_funct = [] + PRE_PROCESS_CONFIG = ONNX_PRE_PROCESS_CONFIG if self.framework == 'onnx' else RKNN_PRE_PROCESS_CONFIG + for item in PRE_PROCESS_CONFIG: + for key in item: + pclass = getattr(utils.operators, key) + p = pclass(**item[key]) + self.preprocess_funct.append(p) + + self.db_postprocess = DBPostProcess(**POSTPROCESS_CONFIG['DBPostProcess']) + self.det_postprocess = DetPostProcess() + + def preprocess(self, img): + for p in self.preprocess_funct: + img = p(img) + + if self.framework == 'onnx': + image_input = img['image'] + image_input = image_input.reshape(1, *image_input.shape) + image_input = image_input.transpose(0, 3, 1, 2) + img['image'] = image_input + return img + + def run(self, img): + model_input = self.preprocess({'image':img}) + output = self.model.run([model_input['image']]) + + preds = {'maps' : output[0].astype(np.float32)} + result = self.db_postprocess(preds, model_input['shape']) + + output = self.det_postprocess.filter_tag_det_res(result[0]['points'], img.shape) + return output + +def setup_model(args): + model_path = args.model_path + if model_path.endswith('.rknn'): + platform = 'rknn' + from py_utils.rknn_executor import RKNN_model_container + model = RKNN_model_container(model_path, args.target, args.device_id) + elif model_path.endswith('onnx'): + platform = 'onnx' + from py_utils.onnx_executor import ONNX_model_container + model = ONNX_model_container(model_path) + else: + assert False, "{} is not rknn/onnx model".format(model_path) + print('Model-{} is {} model, starting val'.format(model_path, platform)) + return model, platform + +def init_args(): + parser = argparse.ArgumentParser(description='PPOCR-Det Python Demo') + # basic params + parser.add_argument('--model_path', type=str, required= True, help='model path, could be .onnx or .rknn file') + parser.add_argument('--target', type=str, default='rk3566', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, default=None, help='device id') + return parser + +if __name__ == '__main__': + # Init model + parser = init_args() + args = parser.parse_args() + det_model = TextDetector(args) + + # Set inputs + 
img_path = '../model/test.jpg' + img = cv2.imread(img_path) + img = cv2.resize(img, (DET_INPUT_SHAPE[1], DET_INPUT_SHAPE[0])) + + # Inference + output = det_model.run(img) + + # Post Process + for box in output: + box = np.array(box).astype(np.int32) + cv2.polylines(img, [box], True, (0, 255, 0), 2) + cv2.imshow('img', img) + cv2.waitKey(0) + + print(output.tolist()) \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Det/python/requirements.txt b/examples/PPOCR/PPOCR-Det/python/requirements.txt deleted file mode 100644 index 4c587ce..0000000 --- a/examples/PPOCR/PPOCR-Det/python/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -paddleocr==2.6 \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Det/python/utility.py b/examples/PPOCR/PPOCR-Det/python/utility.py deleted file mode 100644 index 0a71871..0000000 --- a/examples/PPOCR/PPOCR-Det/python/utility.py +++ /dev/null @@ -1,777 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import sys -import platform -import cv2 -import numpy as np -import paddle -from PIL import Image, ImageDraw, ImageFont -import math -from paddle import inference -import time -import random -import logging - - -def str2bool(v): - return v.lower() in ("true", "t", "1") - - -def init_args(): - parser = argparse.ArgumentParser() - # params for prediction engine - parser.add_argument("--use_gpu", type=str2bool, default=True) - parser.add_argument("--use_xpu", type=str2bool, default=False) - parser.add_argument("--use_npu", type=str2bool, default=False) - parser.add_argument("--ir_optim", type=str2bool, default=True) - parser.add_argument("--use_tensorrt", type=str2bool, default=False) - parser.add_argument("--min_subgraph_size", type=int, default=15) - parser.add_argument("--precision", type=str, default="fp32") - parser.add_argument("--gpu_mem", type=int, default=500) - parser.add_argument("--gpu_id", type=int, default=0) - - # params for text detector - parser.add_argument("--image_dir", type=str) - parser.add_argument("--page_num", type=int, default=0) - parser.add_argument("--det_algorithm", type=str, default='DB') - parser.add_argument("--det_model_dir", type=str) - parser.add_argument("--det_limit_side_len", type=float, default=960) - parser.add_argument("--det_image_shape", type=int, nargs='+', default=[960, 960], help="[h, w]") - parser.add_argument("--det_limit_type", type=str, default='max') - parser.add_argument("--det_box_type", type=str, default='quad') - - # DB parmas - parser.add_argument("--det_db_thresh", type=float, default=0.3) - parser.add_argument("--det_db_box_thresh", type=float, default=0.6) - parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) - parser.add_argument("--max_batch_size", type=int, default=10) - parser.add_argument("--use_dilation", type=str2bool, default=False) - parser.add_argument("--det_db_score_mode", type=str, default="fast") - - # EAST parmas - parser.add_argument("--det_east_score_thresh", 
type=float, default=0.8) - parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) - parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) - - # SAST parmas - parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) - parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) - - # PSE parmas - parser.add_argument("--det_pse_thresh", type=float, default=0) - parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) - parser.add_argument("--det_pse_min_area", type=float, default=16) - parser.add_argument("--det_pse_scale", type=int, default=1) - - # FCE parmas - parser.add_argument("--scales", type=list, default=[8, 16, 32]) - parser.add_argument("--alpha", type=float, default=1.0) - parser.add_argument("--beta", type=float, default=1.0) - parser.add_argument("--fourier_degree", type=int, default=5) - - # params for text recognizer - parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet') - parser.add_argument("--rec_model_dir", type=str) - parser.add_argument("--rec_image_inverse", type=str2bool, default=True) - parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") - parser.add_argument("--rec_batch_num", type=int, default=6) - parser.add_argument("--max_text_length", type=int, default=25) - parser.add_argument( - "--rec_char_dict_path", - type=str, - default="./ppocr/utils/ppocr_keys_v1.txt") - parser.add_argument("--use_space_char", type=str2bool, default=True) - parser.add_argument( - "--vis_font_path", type=str, default="./doc/fonts/simfang.ttf") - parser.add_argument("--drop_score", type=float, default=0.5) - - # params for e2e - parser.add_argument("--e2e_algorithm", type=str, default='PGNet') - parser.add_argument("--e2e_model_dir", type=str) - parser.add_argument("--e2e_limit_side_len", type=float, default=768) - parser.add_argument("--e2e_limit_type", type=str, default='max') - - # PGNet parmas - parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) - parser.add_argument( - "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt") - parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') - parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') - - # params for text classifier - parser.add_argument("--use_angle_cls", type=str2bool, default=False) - parser.add_argument("--cls_model_dir", type=str) - parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") - parser.add_argument("--label_list", type=list, default=['0', '180']) - parser.add_argument("--cls_batch_num", type=int, default=6) - parser.add_argument("--cls_thresh", type=float, default=0.9) - - parser.add_argument("--enable_mkldnn", type=str2bool, default=False) - parser.add_argument("--cpu_threads", type=int, default=10) - parser.add_argument("--use_pdserving", type=str2bool, default=False) - parser.add_argument("--warmup", type=str2bool, default=False) - - # SR parmas - parser.add_argument("--sr_model_dir", type=str) - parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") - parser.add_argument("--sr_batch_num", type=int, default=1) - - # - parser.add_argument( - "--draw_img_save_dir", type=str, default="./inference_results") - parser.add_argument("--save_crop_res", type=str2bool, default=False) - parser.add_argument("--crop_res_save_dir", type=str, default="./output") - - # multi-process - parser.add_argument("--use_mp", type=str2bool, default=False) - parser.add_argument("--total_process_num", type=int, default=1) - 
parser.add_argument("--process_id", type=int, default=0) - - parser.add_argument("--benchmark", type=str2bool, default=False) - parser.add_argument("--save_log_path", type=str, default="./log_output/") - - parser.add_argument("--show_log", type=str2bool, default=True) - parser.add_argument("--use_onnx", type=str2bool, default=False) - parser.add_argument("--use_rknn", type=str2bool, default=False) - parser.add_argument("--platform", type=str, default="rk3568") - return parser - - -def parse_args(): - parser = init_args() - return parser.parse_args() - - -def create_predictor(args, mode, logger): - if mode == "det": - model_dir = args.det_model_dir - elif mode == 'cls': - model_dir = args.cls_model_dir - elif mode == 'rec': - model_dir = args.rec_model_dir - elif mode == 'table': - model_dir = args.table_model_dir - elif mode == 'ser': - model_dir = args.ser_model_dir - elif mode == 're': - model_dir = args.re_model_dir - elif mode == "sr": - model_dir = args.sr_model_dir - elif mode == 'layout': - model_dir = args.layout_model_dir - else: - model_dir = args.e2e_model_dir - - if model_dir is None: - logger.info("not find {} model file path {}".format(mode, model_dir)) - sys.exit(0) - if args.use_onnx: - import onnxruntime as ort - model_file_path = model_dir - if not os.path.exists(model_file_path): - raise ValueError("not find model file path {}".format( - model_file_path)) - sess = ort.InferenceSession(model_file_path) - return sess, sess.get_inputs()[0], None, None - elif args.use_rknn: - from rknn.api import RKNN - rknn = RKNN() - print('--> Load rknn model') - model_file_path = model_dir - if not os.path.exists(model_file_path): - raise ValueError("not find model file path {}".format( - model_file_path)) - ret = rknn.load_rknn(model_file_path) - if ret != 0: - print('Load rknn model failed!') - exit(ret) - print('done') - print('--> Init runtime environment') - # ret = rknn.init_runtime() - ret = rknn.init_runtime(args.platform) - if ret != 0: - print('Init runtime environment failed!') - exit(ret) - print('done') - return rknn, None, None, None - else: - file_names = ['model', 'inference'] - for file_name in file_names: - model_file_path = '{}/{}.pdmodel'.format(model_dir, file_name) - params_file_path = '{}/{}.pdiparams'.format(model_dir, file_name) - if os.path.exists(model_file_path) and os.path.exists( - params_file_path): - break - if not os.path.exists(model_file_path): - raise ValueError( - "not find model.pdmodel or inference.pdmodel in {}".format( - model_dir)) - if not os.path.exists(params_file_path): - raise ValueError( - "not find model.pdiparams or inference.pdiparams in {}".format( - model_dir)) - - config = inference.Config(model_file_path, params_file_path) - - if hasattr(args, 'precision'): - if args.precision == "fp16" and args.use_tensorrt: - precision = inference.PrecisionType.Half - elif args.precision == "int8": - precision = inference.PrecisionType.Int8 - else: - precision = inference.PrecisionType.Float32 - else: - precision = inference.PrecisionType.Float32 - - if args.use_gpu: - gpu_id = get_infer_gpuid() - if gpu_id is None: - logger.warning( - "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." - ) - config.enable_use_gpu(args.gpu_mem, args.gpu_id) - if args.use_tensorrt: - config.enable_tensorrt_engine( - workspace_size=1 << 30, - precision_mode=precision, - max_batch_size=args.max_batch_size, - min_subgraph_size=args. 
- min_subgraph_size, # skip the minmum trt subgraph - use_calib_mode=False) - - # collect shape - trt_shape_f = os.path.join(model_dir, - f"{mode}_trt_dynamic_shape.txt") - - if not os.path.exists(trt_shape_f): - config.collect_shape_range_info(trt_shape_f) - logger.info( - f"collect dynamic shape info into : {trt_shape_f}") - try: - config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, - True) - except Exception as E: - logger.info(E) - logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!") - - elif args.use_npu: - config.enable_custom_device("npu") - elif args.use_xpu: - config.enable_xpu(10 * 1024 * 1024) - else: - config.disable_gpu() - if args.enable_mkldnn: - # cache 10 different shapes for mkldnn to avoid memory leak - config.set_mkldnn_cache_capacity(10) - config.enable_mkldnn() - if args.precision == "fp16": - config.enable_mkldnn_bfloat16() - if hasattr(args, "cpu_threads"): - config.set_cpu_math_library_num_threads(args.cpu_threads) - else: - # default cpu threads as 10 - config.set_cpu_math_library_num_threads(10) - # enable memory optim - config.enable_memory_optim() - config.disable_glog_info() - config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") - config.delete_pass("matmul_transpose_reshape_fuse_pass") - if mode == 're': - config.delete_pass("simplify_with_basic_ops_pass") - if mode == 'table': - config.delete_pass("fc_fuse_pass") # not supported for table - config.switch_use_feed_fetch_ops(False) - config.switch_ir_optim(True) - - # create predictor - predictor = inference.create_predictor(config) - input_names = predictor.get_input_names() - if mode in ['ser', 're']: - input_tensor = [] - for name in input_names: - input_tensor.append(predictor.get_input_handle(name)) - else: - for name in input_names: - input_tensor = predictor.get_input_handle(name) - output_tensors = get_output_tensors(args, mode, predictor) - return predictor, input_tensor, output_tensors, config - - -def get_output_tensors(args, mode, predictor): - output_names = predictor.get_output_names() - output_tensors = [] - if mode == "rec" and args.rec_algorithm in [ - "CRNN", "SVTR_LCNet", "SVTR_HGNet" - ]: - output_name = 'softmax_0.tmp_0' - if output_name in output_names: - return [predictor.get_output_handle(output_name)] - else: - for output_name in output_names: - output_tensor = predictor.get_output_handle(output_name) - output_tensors.append(output_tensor) - else: - for output_name in output_names: - output_tensor = predictor.get_output_handle(output_name) - output_tensors.append(output_tensor) - return output_tensors - - -def get_infer_gpuid(): - sysstr = platform.system() - if sysstr == "Windows": - return 0 - - if not paddle.device.is_compiled_with_rocm: - cmd = "env | grep CUDA_VISIBLE_DEVICES" - else: - cmd = "env | grep HIP_VISIBLE_DEVICES" - env_cuda = os.popen(cmd).readlines() - if len(env_cuda) == 0: - return 0 - else: - gpu_id = env_cuda[0].strip().split("=")[1] - return int(gpu_id[0]) - - -def draw_e2e_res(dt_boxes, strs, img_path): - src_im = cv2.imread(img_path) - for box, str in zip(dt_boxes, strs): - box = box.astype(np.int32).reshape((-1, 1, 2)) - cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) - cv2.putText( - src_im, - str, - org=(int(box[0, 0, 0]), int(box[0, 0, 1])), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=0.7, - color=(0, 255, 0), - thickness=1) - return src_im - - -def draw_text_det_res(dt_boxes, img): - for box in dt_boxes: - box = np.array(box).astype(np.int32).reshape(-1, 2) - cv2.polylines(img, [box], True, color=(255, 255, 0), 
thickness=2) - return img - - -def resize_img(img, input_size=600): - """ - resize img and limit the longest side of the image to input_size - """ - img = np.array(img) - im_shape = img.shape - im_size_max = np.max(im_shape[0:2]) - im_scale = float(input_size) / float(im_size_max) - img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) - return img - - -def draw_ocr(image, - boxes, - txts=None, - scores=None, - drop_score=0.5, - font_path="./doc/fonts/simfang.ttf"): - """ - Visualize the results of OCR detection and recognition - args: - image(Image|array): RGB image - boxes(list): boxes with shape(N, 4, 2) - txts(list): the texts - scores(list): txxs corresponding scores - drop_score(float): only scores greater than drop_threshold will be visualized - font_path: the path of font which is used to draw text - return(array): - the visualized img - """ - if scores is None: - scores = [1] * len(boxes) - box_num = len(boxes) - for i in range(box_num): - if scores is not None and (scores[i] < drop_score or - math.isnan(scores[i])): - continue - box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64) - image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) - if txts is not None: - img = np.array(resize_img(image, input_size=600)) - txt_img = text_visual( - txts, - scores, - img_h=img.shape[0], - img_w=600, - threshold=drop_score, - font_path=font_path) - img = np.concatenate([np.array(img), np.array(txt_img)], axis=1) - return img - return image - - -def draw_ocr_box_txt(image, - boxes, - txts=None, - scores=None, - drop_score=0.5, - font_path="./doc/fonts/simfang.ttf"): - h, w = image.height, image.width - img_left = image.copy() - img_right = np.ones((h, w, 3), dtype=np.uint8) * 255 - random.seed(0) - - draw_left = ImageDraw.Draw(img_left) - if txts is None or len(txts) != len(boxes): - txts = [None] * len(boxes) - for idx, (box, txt) in enumerate(zip(boxes, txts)): - if scores is not None and scores[idx] < drop_score: - continue - color = (random.randint(0, 255), random.randint(0, 255), - random.randint(0, 255)) - draw_left.polygon(box, fill=color) - img_right_text = draw_box_txt_fine((w, h), box, txt, font_path) - pts = np.array(box, np.int32).reshape((-1, 1, 2)) - cv2.polylines(img_right_text, [pts], True, color, 1) - img_right = cv2.bitwise_and(img_right, img_right_text) - img_left = Image.blend(image, img_left, 0.5) - img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) - img_show.paste(img_left, (0, 0, w, h)) - img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h)) - return np.array(img_show) - - -def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): - box_height = int( - math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][1])**2)) - box_width = int( - math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][1])**2)) - - if box_height > 2 * box_width and box_height > 30: - img_text = Image.new('RGB', (box_height, box_width), (255, 255, 255)) - draw_text = ImageDraw.Draw(img_text) - if txt: - font = create_font(txt, (box_height, box_width), font_path) - draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) - img_text = img_text.transpose(Image.ROTATE_270) - else: - img_text = Image.new('RGB', (box_width, box_height), (255, 255, 255)) - draw_text = ImageDraw.Draw(img_text) - if txt: - font = create_font(txt, (box_width, box_height), font_path) - draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) - - pts1 = np.float32( - [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]) - pts2 = np.array(box, 
dtype=np.float32) - M = cv2.getPerspectiveTransform(pts1, pts2) - - img_text = np.array(img_text, dtype=np.uint8) - img_right_text = cv2.warpPerspective( - img_text, - M, - img_size, - flags=cv2.INTER_NEAREST, - borderMode=cv2.BORDER_CONSTANT, - borderValue=(255, 255, 255)) - return img_right_text - - -def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"): - font_size = int(sz[1] * 0.99) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - length = font.getlength(txt) - if length > sz[0]: - font_size = int(font_size * sz[0] / length) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - return font - - -def str_count(s): - """ - Count the number of Chinese characters, - a single English character and a single number - equal to half the length of Chinese characters. - args: - s(string): the input of string - return(int): - the number of Chinese characters - """ - import string - count_zh = count_pu = 0 - s_len = len(s) - en_dg_count = 0 - for c in s: - if c in string.ascii_letters or c.isdigit() or c.isspace(): - en_dg_count += 1 - elif c.isalpha(): - count_zh += 1 - else: - count_pu += 1 - return s_len - math.ceil(en_dg_count / 2) - - -def text_visual(texts, - scores, - img_h=400, - img_w=600, - threshold=0., - font_path="./doc/simfang.ttf"): - """ - create new blank img and draw txt on it - args: - texts(list): the text will be draw - scores(list|None): corresponding score of each txt - img_h(int): the height of blank img - img_w(int): the width of blank img - font_path: the path of font which is used to draw text - return(array): - """ - if scores is not None: - assert len(texts) == len( - scores), "The number of txts and corresponding scores must match" - - def create_blank_img(): - blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 - blank_img[:, img_w - 1:] = 0 - blank_img = Image.fromarray(blank_img).convert("RGB") - draw_txt = ImageDraw.Draw(blank_img) - return blank_img, draw_txt - - blank_img, draw_txt = create_blank_img() - - font_size = 20 - txt_color = (0, 0, 0) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - - gap = font_size + 5 - txt_img_list = [] - count, index = 1, 0 - for idx, txt in enumerate(texts): - index += 1 - if scores[idx] < threshold or math.isnan(scores[idx]): - index -= 1 - continue - first_line = True - while str_count(txt) >= img_w // font_size - 4: - tmp = txt - txt = tmp[:img_w // font_size - 4] - if first_line: - new_txt = str(index) + ': ' + txt - first_line = False - else: - new_txt = ' ' + txt - draw_txt.text((0, gap * count), new_txt, txt_color, font=font) - txt = tmp[img_w // font_size - 4:] - if count >= img_h // gap - 1: - txt_img_list.append(np.array(blank_img)) - blank_img, draw_txt = create_blank_img() - count = 0 - count += 1 - if first_line: - new_txt = str(index) + ': ' + txt + ' ' + '%.3f' % (scores[idx]) - else: - new_txt = " " + txt + " " + '%.3f' % (scores[idx]) - draw_txt.text((0, gap * count), new_txt, txt_color, font=font) - # whether add new blank img or not - if count >= img_h // gap - 1 and idx + 1 < len(texts): - txt_img_list.append(np.array(blank_img)) - blank_img, draw_txt = create_blank_img() - count = 0 - count += 1 - txt_img_list.append(np.array(blank_img)) - if len(txt_img_list) == 1: - blank_img = np.array(txt_img_list[0]) - else: - blank_img = np.concatenate(txt_img_list, axis=1) - return np.array(blank_img) - - -def base64_to_cv2(b64str): - import base64 - data = base64.b64decode(b64str.encode('utf8')) - data = np.frombuffer(data, np.uint8) - data = 
cv2.imdecode(data, cv2.IMREAD_COLOR) - return data - - -def draw_boxes(image, boxes, scores=None, drop_score=0.5): - if scores is None: - scores = [1] * len(boxes) - for (box, score) in zip(boxes, scores): - if score < drop_score: - continue - box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) - image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) - return image - - -def get_rotate_crop_image(img, points): - ''' - img_height, img_width = img.shape[0:2] - left = int(np.min(points[:, 0])) - right = int(np.max(points[:, 0])) - top = int(np.min(points[:, 1])) - bottom = int(np.max(points[:, 1])) - img_crop = img[top:bottom, left:right, :].copy() - points[:, 0] = points[:, 0] - left - points[:, 1] = points[:, 1] - top - ''' - assert len(points) == 4, "shape of points must be 4*2" - img_crop_width = int( - max( - np.linalg.norm(points[0] - points[1]), - np.linalg.norm(points[2] - points[3]))) - img_crop_height = int( - max( - np.linalg.norm(points[0] - points[3]), - np.linalg.norm(points[1] - points[2]))) - pts_std = np.float32([[0, 0], [img_crop_width, 0], - [img_crop_width, img_crop_height], - [0, img_crop_height]]) - M = cv2.getPerspectiveTransform(points, pts_std) - dst_img = cv2.warpPerspective( - img, - M, (img_crop_width, img_crop_height), - borderMode=cv2.BORDER_REPLICATE, - flags=cv2.INTER_CUBIC) - dst_img_height, dst_img_width = dst_img.shape[0:2] - if dst_img_height * 1.0 / dst_img_width >= 1.5: - dst_img = np.rot90(dst_img) - return dst_img - - -def get_minarea_rect_crop(img, points): - bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32)) - points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) - - index_a, index_b, index_c, index_d = 0, 1, 2, 3 - if points[1][1] > points[0][1]: - index_a = 0 - index_d = 1 - else: - index_a = 1 - index_d = 0 - if points[3][1] > points[2][1]: - index_b = 2 - index_c = 3 - else: - index_b = 3 - index_c = 2 - - box = [points[index_a], points[index_b], points[index_c], points[index_d]] - crop_img = get_rotate_crop_image(img, np.array(box)) - return crop_img, box - - -def check_gpu(use_gpu): - if use_gpu and not paddle.is_compiled_with_cuda(): - use_gpu = False - return use_gpu - - -def _check_image_file(path): - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} - return any([path.lower().endswith(e) for e in img_end]) - - -def get_image_file_list(img_file): - imgs_lists = [] - if img_file is None or not os.path.exists(img_file): - raise Exception("not found any img file in {}".format(img_file)) - - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} - if os.path.isfile(img_file) and _check_image_file(img_file): - imgs_lists.append(img_file) - elif os.path.isdir(img_file): - for single_file in os.listdir(img_file): - file_path = os.path.join(img_file, single_file) - if os.path.isfile(file_path) and _check_image_file(file_path): - imgs_lists.append(file_path) - if len(imgs_lists) == 0: - raise Exception("not found any img file in {}".format(img_file)) - imgs_lists = sorted(imgs_lists) - return imgs_lists - - -def check_and_read(img_path): - if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: - gif = cv2.VideoCapture(img_path) - ret, frame = gif.read() - if not ret: - logger = logging.getLogger('ppocr') - logger.info("Cannot read {}. 
This gif image maybe corrupted.") - return None, False - if len(frame.shape) == 2 or frame.shape[-1] == 1: - frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) - imgvalue = frame[:, :, ::-1] - return imgvalue, True, False - elif os.path.basename(img_path)[-3:] in ['pdf']: - import fitz - from PIL import Image - imgs = [] - with fitz.open(img_path) as pdf: - for pg in range(0, pdf.pageCount): - page = pdf[pg] - mat = fitz.Matrix(2, 2) - pm = page.getPixmap(matrix=mat, alpha=False) - - # if width or height > 2000 pixels, don't enlarge the image - if pm.width > 2000 or pm.height > 2000: - pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) - - img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) - img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) - imgs.append(img) - return imgs, False, True - return None, False, False - - -def create_operators(op_param_list, global_config=None): - """ - create operators based on the config - - Args: - params(list): a dict list, used to create some operators - """ - assert isinstance(op_param_list, list), ('operator config should be a list') - ops = [] - for operator in op_param_list: - assert isinstance(operator, - dict) and len(operator) == 1, "yaml format error" - op_name = list(operator)[0] - param = {} if operator[op_name] is None else operator[op_name] - if global_config is not None: - param.update(global_config) - op = eval(op_name)(**param) - ops.append(op) - return ops - - -def transform(data, ops=None): - """ transform """ - if ops is None: - ops = [] - for op in ops: - data = op(data) - if data is None: - return None - return data - - -if __name__ == '__main__': - pass diff --git a/examples/PPOCR/PPOCR-Det/python/utils/__init__.py b/examples/PPOCR/PPOCR-Det/python/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/PPOCR/PPOCR-Det/python/utils/db_postprocess.py b/examples/PPOCR/PPOCR-Det/python/utils/db_postprocess.py new file mode 100644 index 0000000..ac50634 --- /dev/null +++ b/examples/PPOCR/PPOCR-Det/python/utils/db_postprocess.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is referred from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +# import paddle +from shapely.geometry import Polygon +import pyclipper + + +class DBPostProcess(object): + """ + The post process for Differentiable Binarization (DB).
+ """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + score_mode="fast", + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.score_mode = score_mode + assert score_mode in [ + "slow", "fast" + ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) + + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score = self.box_score_fast(pred, points.reshape(-1, 2)) + else: + score = self.box_score_slow(pred, contour) + if self.box_thresh > score: + continue + + box = self.unclip(points).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype(np.int16)) + scores.append(score) + return np.array(boxes, dtype=np.int16), scores + + def unclip(self, box): + unclip_ratio = self.unclip_ratio + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def box_score_slow(self, bitmap, contour): + ''' + box_score_slow: use polyon mean score as the mean score + ''' + h, w = bitmap.shape[:2] + 
contour = contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + # if isinstance(pred, paddle.Tensor): + # pred = pred.numpy() + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate( + np.array(segmentation[batch_index]).astype(np.uint8), + self.dilation_kernel) + else: + mask = segmentation[batch_index] + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, + src_w, src_h) + + boxes_batch.append({'points': boxes}) + return boxes_batch + + +class DistillationDBPostProcess(object): + def __init__(self, + model_name=["student"], + key=None, + thresh=0.3, + box_thresh=0.6, + max_candidates=1000, + unclip_ratio=1.5, + use_dilation=False, + score_mode="fast", + **kwargs): + self.model_name = model_name + self.key = key + self.post_process = DBPostProcess( + thresh=thresh, + box_thresh=box_thresh, + max_candidates=max_candidates, + unclip_ratio=unclip_ratio, + use_dilation=use_dilation, + score_mode=score_mode) + + def __call__(self, predicts, shape_list): + results = {} + for k in self.model_name: + results[k] = self.post_process(predicts[k], shape_list=shape_list) + return results + + +class DetPostProcess(object): + def __init__(self) -> None: + pass + + def order_points_clockwise(self, pts): + """ + reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py + # sort the points based on their x-coordinates + """ + xSorted = pts[np.argsort(pts[:, 0]), :] + + # grab the left-most and right-most points from the sorted + # x-coordinate points + leftMost = xSorted[:2, :] + rightMost = xSorted[2:, :] + + # now, sort the left-most coordinates according to their + # y-coordinates so we can grab the top-left and bottom-left + # points, respectively + leftMost = leftMost[np.argsort(leftMost[:, 1]), :] + (tl, bl) = leftMost + + rightMost = rightMost[np.argsort(rightMost[:, 1]), :] + (tr, br) = rightMost + + rect = np.array([tl, tr, br, bl], dtype="float32") + return rect + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def filter_tag_det_res(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.order_points_clockwise(box) + box = self.clip_det_res(box, img_height, img_width) + rect_width = int(np.linalg.norm(box[0] - box[1])) + rect_height = int(np.linalg.norm(box[0] - box[3])) + if rect_width <= 3 or rect_height <= 3: + continue + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes diff --git a/examples/PPOCR/PPOCR-Det/python/utils/operators.py b/examples/PPOCR/PPOCR-Det/python/utils/operators.py new file mode 100644 index
0000000..f19c15f --- /dev/null +++ b/examples/PPOCR/PPOCR-Det/python/utils/operators.py @@ -0,0 +1,373 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class NRTRDecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + + img = cv2.imdecode(img, 1) + + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) + if self.channel_first: + img = img.transpose((2, 0, 1)) + data['image'] = img + return data + +class NormalizeImage(object): + """ normalize image such as subtract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale - self.mean) / 
self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + return data + + +class KeepKeys(object): + def __init__(self, keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + data_list.append(data[key]) + return data_list + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + super(DetResizeForTest, self).__init__() + self.square_input = True + self.resize_type = 0 + if 'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + + + + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + if len(data['shape'].shape) == 1: + data['shape'] = np.expand_dims(data['shape'], axis=0) + return data + + def resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = self.limit_side_len + h, w, c = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'min': + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. 
+ elif self.limit_type == 'resize_long': + ratio = float(limit_side_len) / max(h,w) + else: + raise Exception('not support limit type, image ') + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = max(int(round(resize_h / 32) * 32), 32) + resize_w = max(int(round(resize_w / 32) * 32), 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] + + +class E2EResizeForTest(object): + def __init__(self, **kwargs): + super(E2EResizeForTest, self).__init__() + self.max_side_len = kwargs['max_side_len'] + self.valid_set = kwargs['valid_set'] + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if self.valid_set == 'totaltext': + im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext( + img, max_side_len=self.max_side_len) + else: + im_resized, (ratio_h, ratio_w) = self.resize_image( + img, max_side_len=self.max_side_len) + data['image'] = im_resized + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_for_totaltext(self, im, max_side_len=512): + + h, w, _ = im.shape + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + def resize_image(self, im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + + +class Pad_to_max_len(object): + def __init__(self, **kwargs): + super(Pad_to_max_len, self).__init__() + self.max_h = kwargs['max_h'] + self.max_w = kwargs['max_w'] + + def __call__(self, data): + img = data['image'] + if img.shape[-1] == 3: + # hwc + 
if img.shape[0] != self.max_h: + # TODO support + # assert False, "not support" + pad_h = self.max_h - img.shape[0] + pad_w = self.max_w - img.shape[1] + img = np.pad(img, ((0, pad_h), (0, pad_w), (0, 0)), 'constant', constant_values=0) + if img.shape[1] < self.max_w: + pad_w = self.max_w - img.shape[1] + img = np.pad(img, ((0, 0), (0, pad_w), (0, 0)), 'constant', constant_values=0) + + elif img.shape[0] == 3: + # chw + img = img.transpose((1, 2, 0)) + if img.shape[1] != self.max_h: + # TODO support + assert False, "not support" + if img.shape[0] < self.max_w: + pad_w = self.max_w - img.shape[0] + img = np.pad(img, ((0, 0), (0, 0), (0, pad_w)), 'constant', constant_values=0) + + else: + assert False, "not support" + + data['image'] = img + + return data \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Rec/README.md b/examples/PPOCR/PPOCR-Rec/README.md index 5d7d1cc..169f400 100644 --- a/examples/PPOCR/PPOCR-Rec/README.md +++ b/examples/PPOCR/PPOCR-Rec/README.md @@ -1,5 +1,11 @@ # PPOCR-Rec +## Current Support Platform + +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 + + + ## Download ONNX model Download link: @@ -35,36 +41,34 @@ python convert.py <onnx_model> <TARGET_PLATFORM> <dtype(optional)> <output_rknn_path(optional)> <onnx_model> should be the ONNX model path. -- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588 according to board SOC version. -- <dtype> is *optional*, could be specified as `i8` or `fp`, `i8` means to do quantization, `fp` means no to do quantization, default is `fp`. +- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588, RK1808, RV1109, RV1126 according to board SOC version. +- <dtype> is *optional*, could be specified as `fp`, `fp` means not to do quantization, default is `fp`. - <output_rknn_path> is **optional**, used to specify the saving path of the RKNN model, default save path is `../model/ppocrv4_rec.rknn` +## Python Demo -## Script Usage +*Usage:* -For ONNX: +```shell +cd python + +# Inference with ONNX model +python ppocr_rec.py --model_path <onnx_model> +# such as: python ppocr_rec.py --model_path ../model/ppocrv4_rec.onnx -```bash -pip install -r python/requirements.txt -python python/ppocr_rec.py \ - --image_dir model/word_1.png \ - --rec_model_dir model/ch_PP-OCRv4_rec_infer/ppocrv4_rec.onnx \ - --rec_char_dict_path model/ppocr_keys_v1.txt \ - --use_gpu false --use_onnx true --rec_image_shape "3, 48, 320" +# Inference with RKNN model +python ppocr_rec.py --model_path <rknn_model> --target <TARGET_PLATFORM> +# such as: python ppocr_rec.py --model_path ../model/ppocrv4_rec.rknn --target rk3588 ``` +*Description:* +- <TARGET_PLATFORM>: Specify NPU platform name. Such as 'rk3588'. -For RKNN: +- <model_path>: specified as the model path. -```bash -python python/ppocr_rec.py \ - --image_dir model/word_1.png \ - --rec_model_dir model/ch_PP-OCRv4_rec_infer/ppocrv4_rec.rknn \ - --rec_char_dict_path model/ppocr_keys_v1.txt \ - --use_gpu false --use_rknn true --platform rk3568 --rec_image_shape "3, 48, 320" -``` ## Android Demo +**Note: RK1808, RV1109, RV1126 do not support Android.** ### Compiling && Building @@ -100,7 +104,7 @@ adb shell cd /data/rknn_PPOCR-Rec_demo export LD_LIBRARY_PATH=./lib -./rknn_ppocr_rec_demo model/ppocrv4_rec.rknn model/word_1.png +./rknn_ppocr_rec_demo model/ppocrv4_rec.rknn model/test.png ``` ## Aarch64 Linux Demo @@ -143,7 +147,7 @@ adb shell cd /data/rknn_PPOCR-Rec_demo export LD_LIBRARY_PATH=./lib -./rknn_ppocr_rec_demo model/ppocrv4_rec.rknn model/word_1.png +./rknn_ppocr_rec_demo model/ppocrv4_rec.rknn model/test.png ``` Note: Try searching the location of librga.so and add it to LD_LIBRARY_PATH if the librga.so is not found in the lib folder.
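Before the C++ sources below, it is worth noting the pre-processing contract the recognizer demos share: the text crop is resized to the model's fixed 48x320 input while keeping its aspect ratio, normalized with (x - 127.5) / 127.5, and zero-padded on the right up to the full model width. A minimal Python sketch of that step is given here; the helper name and default shapes are illustrative assumptions that mirror the rknpu1 C++ implementation further down, not an API from the repo.

```python
import cv2
import numpy as np

def preprocess_rec(img, img_h=48, img_w=320):
    """Aspect-preserving resize + right-pad to the recognizer input (illustrative helper)."""
    h, w = img.shape[:2]
    # Cap the resized width at the model width, as the C++ demo does with ceil(imgH * ratio).
    resized_w = min(img_w, int(np.ceil(img_h * w / h)))
    resized = cv2.resize(img, (resized_w, img_h)).astype(np.float32)
    resized = (resized - 127.5) / 127.5            # normalize to roughly [-1, 1]
    padded = np.zeros((img_h, img_w, 3), dtype=np.float32)
    padded[:, :resized_w, :] = resized             # zero-pad on the right
    return padded
```

Padding with a constant rather than stretching keeps character proportions intact, which is what the CTC-style recognizer expects; the C++ code achieves the same effect with `copyMakeBorder` and a constant border value.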
diff --git a/examples/PPOCR/PPOCR-Rec/cpp/CMakeLists.txt b/examples/PPOCR/PPOCR-Rec/cpp/CMakeLists.txt index b624f2b..4d22877 100644 --- a/examples/PPOCR/PPOCR-Rec/cpp/CMakeLists.txt +++ b/examples/PPOCR/PPOCR-Rec/cpp/CMakeLists.txt @@ -41,12 +41,19 @@ message(STATUS OpenCV_LIBS=${OpenCV_LIBS}) set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(ppocr_rec_file rknpu1/ppocr_rec.cc) +else() + set(ppocr_rec_file rknpu2/ppocr_rec.cc) +endif() + + file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc postprocess.cc - rknpu2/ppocr_rec.cc + ${ppocr_rec_file} ) target_link_libraries(${PROJECT_NAME} @@ -73,6 +80,6 @@ target_include_directories(${PROJECT_NAME} PRIVATE ) install(TARGETS ${PROJECT_NAME} DESTINATION .) -install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/word_1.png DESTINATION model) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/test.png DESTINATION model) file(GLOB RKNN_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../model/*.rknn") install(FILES ${RKNN_FILES} DESTINATION model) \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Rec/cpp/rknpu1/ppocr_rec.cc b/examples/PPOCR/PPOCR-Rec/cpp/rknpu1/ppocr_rec.cc new file mode 100644 index 0000000..e7fc093 --- /dev/null +++ b/examples/PPOCR/PPOCR-Rec/cpp/rknpu1/ppocr_rec.cc @@ -0,0 +1,192 @@ +#include +#include +#include "opencv2/opencv.hpp" + +#include "ppocr_rec.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr* attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_ppocr_rec_model(const char* model_path, rknn_app_context_t* app_ctx) +{ + int ret; + int model_len = 0; + char* model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr*)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr*)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_ppocr_rec_model(rknn_app_context_t* app_ctx) +{ + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_ppocr_rec_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, ppocr_rec_result* out_result) +{ + int ret; + rknn_input inputs[1]; + rknn_output outputs[1]; + int allow_slight_change = 1; + + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + float ratio = src_img->width / float(src_img->height); + int resized_w; + int imgW = app_ctx->model_width, imgH = app_ctx->model_height; + if (std::ceil(imgH*ratio) > imgW) { + resized_w = imgW; + } + else { + resized_w = std::ceil(imgH*ratio); + } + + cv::Mat img_M = cv::Mat(src_img->height, src_img->width, CV_8UC3,(uint8_t*)src_img->virt_addr); + cv::resize(img_M, img_M, cv::Size(resized_w, imgH)); + img_M.convertTo(img_M, CV_32FC3); + img_M = (img_M - 127.5)/127.5; + if (resized_w < imgW) { + copyMakeBorder(img_M, img_M, 0, 0, 0, imgW- resized_w, cv::BORDER_CONSTANT, 0); + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_FLOAT32; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel * sizeof(float); + // inputs[0].buf = img.virt_addr; + inputs[0].buf = malloc(inputs[0].size); + memcpy(inputs[0].buf, img_M.data, inputs[0].size); + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Get Output + int out_len_seq = app_ctx->model_width / 8; + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + ret = rec_postprocess((float*)outputs[0].buf, MODEL_OUT_CHANNEL, out_len_seq, out_result); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + +out: + if (inputs[0].buf != NULL) { + free(inputs[0].buf); + } + + return ret; +} \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Rec/cpp/rknpu2/ppocr_rec.cc b/examples/PPOCR/PPOCR-Rec/cpp/rknpu2/ppocr_rec.cc index 24c773b..ec6316e 100644 --- a/examples/PPOCR/PPOCR-Rec/cpp/rknpu2/ppocr_rec.cc +++ b/examples/PPOCR/PPOCR-Rec/cpp/rknpu2/ppocr_rec.cc @@ -101,10 +101,6 @@ int init_ppocr_rec_model(const char* model_path, rknn_app_context_t* app_ctx) int release_ppocr_rec_model(rknn_app_context_t* app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -113,6 +109,10 @@ int release_ppocr_rec_model(rknn_app_context_t* app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/PPOCR/PPOCR-Rec/model/word_1.png b/examples/PPOCR/PPOCR-Rec/model/test.png similarity index 100% rename from examples/PPOCR/PPOCR-Rec/model/word_1.png rename to examples/PPOCR/PPOCR-Rec/model/test.png diff --git a/examples/PPOCR/PPOCR-Rec/python/config.yaml b/examples/PPOCR/PPOCR-Rec/python/config.yaml deleted file mode 100644 index 0744a19..0000000 --- a/examples/PPOCR/PPOCR-Rec/python/config.yaml +++ /dev/null @@ -1,7 +0,0 @@ -mean: -std: -model_path: ./model/ch_PP-OCRv4_rec_infer/ppocrv4_rec.onnx -outputs_nodes: -do_quantization: False -dataset: -output_folder: "./model/ch_PP-OCRv4_rec_infer" diff --git a/examples/PPOCR/PPOCR-Rec/python/convert.py b/examples/PPOCR/PPOCR-Rec/python/convert.py index dcf326a..3940c5c 100644 --- a/examples/PPOCR/PPOCR-Rec/python/convert.py +++ b/examples/PPOCR/PPOCR-Rec/python/convert.py @@ -11,21 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os import sys -from tabnanny import verbose from rknn.api import RKNN - -DATASET_PATH = '../../../../datasets/PPOCR/imgs/dataset_20.txt' DEFAULT_RKNN_PATH = '../model/ppocrv4_rec.rknn' DEFAULT_QUANT = False +RKNPU1_PLATFORM = ['rk1808', 'rv1109', 'rv1126'] def parse_arg(): if len(sys.argv) < 3: print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print(" platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]") + print(" dtype choose from [fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -34,14 +32,9 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': - assert False, "i8 PPOCR-Rec got accuracy drop yet!" - do_quant = True - else: - do_quant = False if len(sys.argv) > 4: output_path = sys.argv[4] @@ -53,33 +46,46 @@ def parse_arg(): if __name__ == '__main__': model_path, platform, do_quant, output_path = parse_arg() - model = RKNN(verbose=False) + # Create RKNN object + rknn = RKNN(verbose=False) + + # Pre-process config + print('--> Config model') + if platform in RKNPU1_PLATFORM: + rknn.config(target_platform=platform) + else: + rknn.config( + target_platform=platform, + # In order to improve accuracy, these nodes need to fallback to CPU on RKNPU2 platform. + op_target={'p2o.Add.235_shape4':'cpu', 'p2o.Add.245_shape4':'cpu', 'p2o.Add.255_shape4':'cpu', + 'p2o.Add.265_shape4':'cpu', 'p2o.Add.275_shape4':'cpu'} + ) - # Config - model.config( - target_platform=platform, - op_target={'p2o.Add.235_shape4':'cpu', 'p2o.Add.245_shape4':'cpu', 'p2o.Add.255_shape4':'cpu', - 'p2o.Add.265_shape4':'cpu', 'p2o.Add.275_shape4':'cpu'} - ) + print('done') - # Load ONNX model - ret = model.load_onnx(model=model_path) - assert ret == 0, "Load model failed!" + # Load model + print('--> Loading model') + ret = rknn.load_onnx(model=model_path) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') # Build model - ret = model.build( - do_quantization=do_quant) - assert ret == 0, "Build model failed!" - - # Init Runtime - # ret = model.init_runtime() - # assert ret == 0, "Init runtime environment failed!" + print('--> Building model') + ret = rknn.build(do_quantization=do_quant) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') - # Export - if not os.path.exists(os.path.dirname(output_path)): - os.mkdir(os.path.dirname(output_path)) + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model failed!') + exit(ret) + print('done') - ret = model.export_rknn( - output_path) - assert ret == 0, "Export rknn model failed!" - print("Export OK!") + # Release + rknn.release() \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Rec/python/ppocr_rec.py b/examples/PPOCR/PPOCR-Rec/python/ppocr_rec.py index 9bf7103..5864718 100644 --- a/examples/PPOCR/PPOCR-Rec/python/ppocr_rec.py +++ b/examples/PPOCR/PPOCR-Rec/python/ppocr_rec.py @@ -13,680 +13,105 @@ # limitations under the License. 
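Before the rewritten recognizer demo below: the C++ demo above fits a text crop into the fixed 48x320 recognizer input by resizing to height 48 while keeping the aspect ratio, then right-padding the width. A minimal NumPy/OpenCV sketch of that step, using the same (x - 127.5) / 127.5 normalization as the C++ code; the helper name is ours:

```python
import cv2
import numpy as np

def rec_preprocess(img, img_h=48, img_w=320):
    h, w = img.shape[:2]
    # Resize so height == img_h, clamping the width to the model input width
    resized_w = min(img_w, int(np.ceil(img_h * w / float(h))))
    out = cv2.resize(img, (resized_w, img_h)).astype(np.float32)
    out = (out - 127.5) / 127.5                 # same normalization as the C++ demo
    if resized_w < img_w:                       # right-pad with zeros, like copyMakeBorder
        out = np.pad(out, ((0, 0), (0, img_w - resized_w), (0, 0)), 'constant')
    return out
```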
import os import sys -from PIL import Image -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' - +import argparse import cv2 import numpy as np -import math -import time -import traceback +import utils.operators +from utils.rec_postprocess import CTCLabelDecode -import utility -from paddleocr.ppocr.postprocess import build_post_process -from paddleocr.ppocr.utils.logging import get_logger -from paddleocr.ppocr.utils.utility import get_image_file_list, check_and_read +# add path +realpath = os.path.abspath(__file__) +_sep = os.path.sep +realpath = realpath.split(_sep) +sys.path.append(os.path.join(realpath[0]+_sep, *realpath[1:realpath.index('rknn_model_zoo')+1])) -logger = get_logger() +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' +REC_INPUT_SHAPE = [48, 320] # h,w +CHARACTER_DICT_PATH= '../model/ppocr_keys_v1.txt' -class TextRecognizer(object): - def __init__(self, args): - self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] - self.rec_batch_num = args.rec_batch_num - self.rec_algorithm = args.rec_algorithm - postprocess_params = { - 'name': 'CTCLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - if self.rec_algorithm == "SRN": - postprocess_params = { - 'name': 'SRNLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "RARE": - postprocess_params = { - 'name': 'AttnLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == 'NRTR': - postprocess_params = { - 'name': 'NRTRLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "SAR": - postprocess_params = { - 'name': 'SARLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "VisionLAN": - postprocess_params = { - 'name': 'VLLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char +PRE_PROCESS_CONFIG = [ + { + 'NormalizeImage': { + 'std': [1, 1, 1], + 'mean': [0, 0, 0], + 'scale': '1./255.', + 'order': 'hwc' } - elif self.rec_algorithm == 'ViTSTR': - postprocess_params = { - 'name': 'ViTSTRLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == 'ABINet': - postprocess_params = { - 'name': 'ABINetLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "SPIN": - postprocess_params = { - 'name': 'SPINLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "RobustScanner": - postprocess_params = { - 'name': 'SARLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char, - "rm_symbol": True - } - elif self.rec_algorithm == 'RFL': - postprocess_params = { - 'name': 'RFLLabelDecode', - "character_dict_path": None, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "SATRN": - postprocess_params = { - 'name': 'SATRNLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char, - "rm_symbol": True - 
} - elif self.rec_algorithm == "PREN": - postprocess_params = {'name': 'PRENLabelDecode'} - elif self.rec_algorithm == "CAN": - self.inverse = args.rec_image_inverse - postprocess_params = { - 'name': 'CANLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - self.postprocess_op = build_post_process(postprocess_params) - self.predictor, self.input_tensor, self.output_tensors, self.config = \ - utility.create_predictor(args, 'rec', logger) - self.benchmark = args.benchmark - self.use_onnx = args.use_onnx - self.use_rknn = args.use_rknn - if args.benchmark: - import auto_log - pid = os.getpid() - gpu_id = utility.get_infer_gpuid() - self.autolog = auto_log.AutoLogger( - model_name="rec", - model_precision=args.precision, - batch_size=args.rec_batch_num, - data_shape="dynamic", - save_path=None, #args.save_log_path, - inference_config=self.config, - pids=pid, - process_name=None, - gpu_ids=gpu_id if args.use_gpu else None, - time_keys=[ - 'preprocess_time', 'inference_time', 'postprocess_time' - ], - warmup=0, - logger=logger) - - def resize_norm_img(self, img, max_wh_ratio): - imgC, imgH, imgW = self.rec_image_shape - if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR': - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # return padding_im - image_pil = Image.fromarray(np.uint8(img)) - if self.rec_algorithm == 'ViTSTR': - img = image_pil.resize([imgW, imgH], Image.BICUBIC) - else: - img = image_pil.resize([imgW, imgH], Image.LANCZOS) - img = np.array(img) - norm_img = np.expand_dims(img, -1) - norm_img = norm_img.transpose((2, 0, 1)) - if self.rec_algorithm == 'ViTSTR': - norm_img = norm_img.astype(np.float32) / 255. - else: - norm_img = norm_img.astype(np.float32) / 128. - 1. - return norm_img - elif self.rec_algorithm == 'RFL': - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_CUBIC) - resized_image = resized_image.astype('float32') - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - resized_image -= 0.5 - resized_image /= 0.5 - return resized_image - - assert imgC == img.shape[2] - imgW = int((imgH * max_wh_ratio)) - if self.use_onnx: - w = self.input_tensor.shape[3:][0] - if isinstance(w, str): - pass - elif w is not None and w > 0: - imgW = w - if self.use_rknn: - imgW = self.rec_image_shape[2] - h, w = img.shape[:2] - ratio = w / float(h) - if math.ceil(imgH * ratio) > imgW: - resized_w = imgW - else: - resized_w = int(math.ceil(imgH * ratio)) - if self.rec_algorithm == 'RARE': - if resized_w > self.rec_image_shape[2]: - resized_w = self.rec_image_shape[2] - imgW = self.rec_image_shape[2] - resized_image = cv2.resize(img, (resized_w, imgH)) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) - padding_im[:, :, 0:resized_w] = resized_image - return padding_im - - def resize_norm_img_vl(self, img, image_shape): - - imgC, imgH, imgW = image_shape - img = img[:, :, ::-1] # bgr2rgb - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - return resized_image - - def resize_norm_img_srn(self, img, image_shape): - imgC, imgH, imgW = image_shape - - img_black = np.zeros((imgH, imgW)) - im_hei = img.shape[0] - im_wid = 
img.shape[1] - - if im_wid <= im_hei * 1: - img_new = cv2.resize(img, (imgH * 1, imgH)) - elif im_wid <= im_hei * 2: - img_new = cv2.resize(img, (imgH * 2, imgH)) - elif im_wid <= im_hei * 3: - img_new = cv2.resize(img, (imgH * 3, imgH)) - else: - img_new = cv2.resize(img, (imgW, imgH)) - - img_np = np.asarray(img_new) - img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) - img_black[:, 0:img_np.shape[1]] = img_np - img_black = img_black[:, :, np.newaxis] - - row, col, c = img_black.shape - c = 1 - - return np.reshape(img_black, (c, row, col)).astype(np.float32) - - def srn_other_inputs(self, image_shape, num_heads, max_text_length): - - imgC, imgH, imgW = image_shape - feature_dim = int((imgH / 8) * (imgW / 8)) - - encoder_word_pos = np.array(range(0, feature_dim)).reshape( - (feature_dim, 1)).astype('int64') - gsrm_word_pos = np.array(range(0, max_text_length)).reshape( - (max_text_length, 1)).astype('int64') - - gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) - gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( - [-1, 1, max_text_length, max_text_length]) - gsrm_slf_attn_bias1 = np.tile( - gsrm_slf_attn_bias1, - [1, num_heads, 1, 1]).astype('float32') * [-1e9] - - gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( - [-1, 1, max_text_length, max_text_length]) - gsrm_slf_attn_bias2 = np.tile( - gsrm_slf_attn_bias2, - [1, num_heads, 1, 1]).astype('float32') * [-1e9] - - encoder_word_pos = encoder_word_pos[np.newaxis, :] - gsrm_word_pos = gsrm_word_pos[np.newaxis, :] - - return [ - encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, - gsrm_slf_attn_bias2 + } ] - def process_image_srn(self, img, image_shape, num_heads, max_text_length): - norm_img = self.resize_norm_img_srn(img, image_shape) - norm_img = norm_img[np.newaxis, :] - - [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ - self.srn_other_inputs(image_shape, num_heads, max_text_length) - - gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32) - gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32) - encoder_word_pos = encoder_word_pos.astype(np.int64) - gsrm_word_pos = gsrm_word_pos.astype(np.int64) - - return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, - gsrm_slf_attn_bias2) - - def resize_norm_img_sar(self, img, image_shape, - width_downsample_ratio=0.25): - imgC, imgH, imgW_min, imgW_max = image_shape - h = img.shape[0] - w = img.shape[1] - valid_ratio = 1.0 - # make sure new_width is an integral multiple of width_divisor. 
- width_divisor = int(1 / width_downsample_ratio) - # resize - ratio = w / float(h) - resize_w = math.ceil(imgH * ratio) - if resize_w % width_divisor != 0: - resize_w = round(resize_w / width_divisor) * width_divisor - if imgW_min is not None: - resize_w = max(imgW_min, resize_w) - if imgW_max is not None: - valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) - resize_w = min(imgW_max, resize_w) - resized_image = cv2.resize(img, (resize_w, imgH)) - resized_image = resized_image.astype('float32') - # norm - if image_shape[0] == 1: - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - else: - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - resize_shape = resized_image.shape - padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) - padding_im[:, :, 0:resize_w] = resized_image - pad_shape = padding_im.shape - - return padding_im, resize_shape, pad_shape, valid_ratio - - def resize_norm_img_spin(self, img): - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # return padding_im - img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC) - img = np.array(img, np.float32) - img = np.expand_dims(img, -1) - img = img.transpose((2, 0, 1)) - mean = [127.5] - std = [127.5] - mean = np.array(mean, dtype=np.float32) - std = np.array(std, dtype=np.float32) - mean = np.float32(mean.reshape(1, -1)) - stdinv = 1 / np.float32(std.reshape(1, -1)) - img -= mean - img *= stdinv - return img - - def resize_norm_img_svtr(self, img, image_shape): - - imgC, imgH, imgW = image_shape - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - return resized_image - - def resize_norm_img_abinet(self, img, image_shape): - - imgC, imgH, imgW = image_shape - - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image / 255. - - mean = np.array([0.485, 0.456, 0.406]) - std = np.array([0.229, 0.224, 0.225]) - resized_image = ( - resized_image - mean[None, None, ...]) / std[None, None, ...] 
- resized_image = resized_image.transpose((2, 0, 1)) - resized_image = resized_image.astype('float32') - - return resized_image - - def norm_img_can(self, img, image_shape): - - img = cv2.cvtColor( - img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image - - if self.inverse: - img = 255 - img - - if self.rec_image_shape[0] == 1: - h, w = img.shape - _, imgH, imgW = self.rec_image_shape - if h < imgH or w < imgW: - padding_h = max(imgH - h, 0) - padding_w = max(imgW - w, 0) - img_padded = np.pad(img, ((0, padding_h), (0, padding_w)), - 'constant', - constant_values=(255)) - img = img_padded - - img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w - img = img.astype('float32') - +POSTPROCESS_CONFIG = { + 'CTCLabelDecode':{ + "character_dict_path": CHARACTER_DICT_PATH, + "use_space_char": True + } + } +class TextRecognizer: + def __init__(self, args) -> None: + self.model, self.framework = setup_model(args) + self.preprocess_funct = [] + for item in PRE_PROCESS_CONFIG: + for key in item: + pclass = getattr(utils.operators, key) + p = pclass(**item[key]) + self.preprocess_funct.append(p) + + self.ctc_postprocess = CTCLabelDecode(**POSTPROCESS_CONFIG['CTCLabelDecode']) + + def preprocess(self, img): + for p in self.preprocess_funct: + img = p(img) + + if self.framework == 'onnx': + image_input = img['image'] + image_input = image_input.reshape(1, *image_input.shape) + image_input = image_input.transpose(0, 3, 1, 2) + img['image'] = image_input return img - def release_rknn(self): - self.predictor.release() - - def __call__(self, img_list): - img_num = len(img_list) - # Calculate the aspect ratio of all text bars - width_list = [] - for img in img_list: - width_list.append(img.shape[1] / float(img.shape[0])) - # Sorting can speed up the recognition process - indices = np.argsort(np.array(width_list)) - rec_res = [['', 0.0]] * img_num - batch_num = self.rec_batch_num - st = time.time() - if self.benchmark: - self.autolog.times.start() - for beg_img_no in range(0, img_num, batch_num): - end_img_no = min(img_num, beg_img_no + batch_num) - norm_img_batch = [] - if self.rec_algorithm == "SRN": - encoder_word_pos_list = [] - gsrm_word_pos_list = [] - gsrm_slf_attn_bias1_list = [] - gsrm_slf_attn_bias2_list = [] - if self.rec_algorithm == "SAR": - valid_ratios = [] - imgC, imgH, imgW = self.rec_image_shape[:3] - max_wh_ratio = imgW / imgH - # max_wh_ratio = 0 - for ino in range(beg_img_no, end_img_no): - h, w = img_list[indices[ino]].shape[0:2] - wh_ratio = w * 1.0 / h - max_wh_ratio = max(max_wh_ratio, wh_ratio) - for ino in range(beg_img_no, end_img_no): - if self.rec_algorithm == "SAR": - norm_img, _, _, valid_ratio = self.resize_norm_img_sar( - img_list[indices[ino]], self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - valid_ratio = np.expand_dims(valid_ratio, axis=0) - valid_ratios.append(valid_ratio) - norm_img_batch.append(norm_img) - elif self.rec_algorithm == "SRN": - norm_img = self.process_image_srn( - img_list[indices[ino]], self.rec_image_shape, 8, 25) - encoder_word_pos_list.append(norm_img[1]) - gsrm_word_pos_list.append(norm_img[2]) - gsrm_slf_attn_bias1_list.append(norm_img[3]) - gsrm_slf_attn_bias2_list.append(norm_img[4]) - norm_img_batch.append(norm_img[0]) - elif self.rec_algorithm in ["SVTR", "SATRN"]: - norm_img = self.resize_norm_img_svtr(img_list[indices[ino]], - self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm in ["VisionLAN", "PREN"]: - norm_img = 
self.resize_norm_img_vl(img_list[indices[ino]], - self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm == 'SPIN': - norm_img = self.resize_norm_img_spin(img_list[indices[ino]]) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm == "ABINet": - norm_img = self.resize_norm_img_abinet( - img_list[indices[ino]], self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm == "RobustScanner": - norm_img, _, _, valid_ratio = self.resize_norm_img_sar( - img_list[indices[ino]], - self.rec_image_shape, - width_downsample_ratio=0.25) - norm_img = norm_img[np.newaxis, :] - valid_ratio = np.expand_dims(valid_ratio, axis=0) - valid_ratios = [] - valid_ratios.append(valid_ratio) - norm_img_batch.append(norm_img) - word_positions_list = [] - word_positions = np.array(range(0, 40)).astype('int64') - word_positions = np.expand_dims(word_positions, axis=0) - word_positions_list.append(word_positions) - elif self.rec_algorithm == "CAN": - norm_img = self.norm_img_can(img_list[indices[ino]], - max_wh_ratio) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_image_mask = np.ones(norm_img.shape, dtype='float32') - word_label = np.ones([1, 36], dtype='int64') - norm_img_mask_batch = [] - word_label_list = [] - norm_img_mask_batch.append(norm_image_mask) - word_label_list.append(word_label) - else: - norm_img = self.resize_norm_img(img_list[indices[ino]], - max_wh_ratio) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_img_batch = np.concatenate(norm_img_batch) - norm_img_batch = norm_img_batch.copy() - if self.benchmark: - self.autolog.times.stamp() - - if self.rec_algorithm == "SRN": - encoder_word_pos_list = np.concatenate(encoder_word_pos_list) - gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list) - gsrm_slf_attn_bias1_list = np.concatenate( - gsrm_slf_attn_bias1_list) - gsrm_slf_attn_bias2_list = np.concatenate( - gsrm_slf_attn_bias2_list) - - inputs = [ - norm_img_batch, - encoder_word_pos_list, - gsrm_word_pos_list, - gsrm_slf_attn_bias1_list, - gsrm_slf_attn_bias2_list, - ] - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = {"predict": outputs[2]} - else: - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle( - input_names[i]) - input_tensor.copy_from_cpu(inputs[i]) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - preds = {"predict": outputs[2]} - elif self.rec_algorithm == "SAR": - valid_ratios = np.concatenate(valid_ratios) - inputs = [ - norm_img_batch, - np.array( - [valid_ratios], dtype=np.float32), - ] - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = outputs[0] - else: - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle( - input_names[i]) - input_tensor.copy_from_cpu(inputs[i]) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if 
self.benchmark: - self.autolog.times.stamp() - preds = outputs[0] - elif self.rec_algorithm == "RobustScanner": - valid_ratios = np.concatenate(valid_ratios) - word_positions_list = np.concatenate(word_positions_list) - inputs = [norm_img_batch, valid_ratios, word_positions_list] - - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = outputs[0] - else: - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle( - input_names[i]) - input_tensor.copy_from_cpu(inputs[i]) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - preds = outputs[0] - elif self.rec_algorithm == "CAN": - norm_img_mask_batch = np.concatenate(norm_img_mask_batch) - word_label_list = np.concatenate(word_label_list) - inputs = [norm_img_batch, norm_img_mask_batch, word_label_list] - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = outputs - else: - input_names = self.predictor.get_input_names() - input_tensor = [] - for i in range(len(input_names)): - input_tensor_i = self.predictor.get_input_handle( - input_names[i]) - input_tensor_i.copy_from_cpu(inputs[i]) - input_tensor.append(input_tensor_i) - self.input_tensor = input_tensor - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - preds = outputs - else: - if self.use_onnx: - input_dict = {} - preds = [] - for idx in range(norm_img_batch.shape[0]): - input_dict[self.input_tensor.name] = norm_img_batch[idx:idx+1] - output = self.predictor.run(self.output_tensors, - input_dict) - preds.append(output[0]) - preds = np.concatenate(preds) - elif self.use_rknn: - preds = [] - for idx in range(norm_img_batch.shape[0]): - img = norm_img_batch[idx:idx+1] - output = self.predictor.inference(inputs=[img], data_format=['nchw']) - preds.append(output[0]) - preds = np.concatenate(preds) - else: - self.input_tensor.copy_from_cpu(norm_img_batch) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - if len(outputs) != 1: - preds = outputs - else: - preds = outputs[0] - rec_result = self.postprocess_op(preds) - for rno in range(len(rec_result)): - rec_res[indices[beg_img_no + rno]] = rec_result[rno] - if self.benchmark: - self.autolog.times.end(stamp=True) - return rec_res, time.time() - st - - -def main(args): - image_file_list = get_image_file_list(args.image_dir) - text_recognizer = TextRecognizer(args) - valid_image_file_list = [] - img_list = [] - - logger.info( - "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " - "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320" - ) - # warmup 2 times - if args.warmup: - img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8) - for i in range(2): - res = text_recognizer([img] * int(args.rec_batch_num)) - - for image_file in image_file_list: - img, flag, _ = check_and_read(image_file) - if not flag: - img = cv2.imread(image_file) - if img 
is None:
-                logger.info("error in loading image:{}".format(image_file))
-                continue
-            valid_image_file_list.append(image_file)
-            img_list.append(img)
-        try:
-            rec_res, _ = text_recognizer(img_list)
-
-        except Exception as E:
-            logger.info(traceback.format_exc())
-            logger.info(E)
-            exit()
-        for ino in range(len(img_list)):
-            logger.info("Predicts of {}:{}".format(valid_image_file_list[ino],
-                                                   rec_res[ino]))
-        if args.benchmark:
-            text_recognizer.autolog.report()
-        if args.use_rknn:
-            text_recognizer.release_rknn()
-
-
-if __name__ == "__main__":
-    main(utility.parse_args())
+    def run(self, img):
+        model_input = self.preprocess({'image': img})
+        output = self.model.run([model_input['image']])
+        preds = output[0].astype(np.float32)
+        output = self.ctc_postprocess(preds)
+        return output
+
+def setup_model(args):
+    model_path = args.model_path
+    if model_path.endswith('.rknn'):
+        platform = 'rknn'
+        from py_utils.rknn_executor import RKNN_model_container
+        model = RKNN_model_container(model_path, args.target, args.device_id)
+    elif model_path.endswith('onnx'):
+        platform = 'onnx'
+        from py_utils.onnx_executor import ONNX_model_container
+        model = ONNX_model_container(model_path)
+    else:
+        assert False, "{} is not a rknn/onnx model".format(model_path)
+    print('Model-{} is a {} model, starting inference'.format(model_path, platform))
+    return model, platform
+
+def init_args():
+    parser = argparse.ArgumentParser(description='PPOCR-Rec Python Demo')
+    # basic params
+    parser.add_argument('--model_path', type=str, required=True, help='model path, could be .onnx or .rknn file')
+    parser.add_argument('--target', type=str, default='rk3566', help='target RKNPU platform')
+    parser.add_argument('--device_id', type=str, default=None, help='device id')
+    return parser
+
+if __name__ == '__main__':
+    # Init model
+    parser = init_args()
+    args = parser.parse_args()
+    rec_model = TextRecognizer(args)
+
+    # Set inputs
+    img_path = '../model/test.png'
+    img = cv2.imread(img_path)
+    img = cv2.resize(img, (REC_INPUT_SHAPE[1], REC_INPUT_SHAPE[0]))
+
+    # Inference
+    output = rec_model.run(img)
+
+    print(output)
diff --git a/examples/PPOCR/PPOCR-Rec/python/requirements.txt b/examples/PPOCR/PPOCR-Rec/python/requirements.txt
deleted file mode 100644
index 4c587ce..0000000
--- a/examples/PPOCR/PPOCR-Rec/python/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-paddleocr==2.6
\ No newline at end of file
diff --git a/examples/PPOCR/PPOCR-Rec/python/utility.py b/examples/PPOCR/PPOCR-Rec/python/utility.py
deleted file mode 100644
index 0a71871..0000000
--- a/examples/PPOCR/PPOCR-Rec/python/utility.py
+++ /dev/null
@@ -1,777 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
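The decoding step in the new demo (`CTCLabelDecode` above, `rec_postprocess` in the C++ demo) follows the usual greedy CTC scheme: take the arg-max class at each time step, collapse consecutive repeats, and drop the blank class. A toy sketch of that idea, not the library implementation; the blank index and character table here are assumptions:

```python
import numpy as np

def ctc_greedy_decode(preds, charset, blank=0):
    """preds: (seq_len, num_classes) scores; charset excludes the blank class."""
    idx = preds.argmax(axis=1)
    out, prev = [], blank
    for i in idx:
        if i != blank and i != prev:      # collapse repeats, skip blank
            out.append(charset[i - 1])    # assume index 0 is reserved for blank
        prev = i
    return ''.join(out)

charset = list('abc')
toy = np.eye(4)[[1, 1, 0, 2, 2, 3]]       # "a", repeat, blank, "b", repeat, "c"
print(ctc_greedy_decode(toy, charset))    # -> "abc"
```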
- -import argparse -import os -import sys -import platform -import cv2 -import numpy as np -import paddle -from PIL import Image, ImageDraw, ImageFont -import math -from paddle import inference -import time -import random -import logging - - -def str2bool(v): - return v.lower() in ("true", "t", "1") - - -def init_args(): - parser = argparse.ArgumentParser() - # params for prediction engine - parser.add_argument("--use_gpu", type=str2bool, default=True) - parser.add_argument("--use_xpu", type=str2bool, default=False) - parser.add_argument("--use_npu", type=str2bool, default=False) - parser.add_argument("--ir_optim", type=str2bool, default=True) - parser.add_argument("--use_tensorrt", type=str2bool, default=False) - parser.add_argument("--min_subgraph_size", type=int, default=15) - parser.add_argument("--precision", type=str, default="fp32") - parser.add_argument("--gpu_mem", type=int, default=500) - parser.add_argument("--gpu_id", type=int, default=0) - - # params for text detector - parser.add_argument("--image_dir", type=str) - parser.add_argument("--page_num", type=int, default=0) - parser.add_argument("--det_algorithm", type=str, default='DB') - parser.add_argument("--det_model_dir", type=str) - parser.add_argument("--det_limit_side_len", type=float, default=960) - parser.add_argument("--det_image_shape", type=int, nargs='+', default=[960, 960], help="[h, w]") - parser.add_argument("--det_limit_type", type=str, default='max') - parser.add_argument("--det_box_type", type=str, default='quad') - - # DB parmas - parser.add_argument("--det_db_thresh", type=float, default=0.3) - parser.add_argument("--det_db_box_thresh", type=float, default=0.6) - parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) - parser.add_argument("--max_batch_size", type=int, default=10) - parser.add_argument("--use_dilation", type=str2bool, default=False) - parser.add_argument("--det_db_score_mode", type=str, default="fast") - - # EAST parmas - parser.add_argument("--det_east_score_thresh", type=float, default=0.8) - parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) - parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) - - # SAST parmas - parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) - parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) - - # PSE parmas - parser.add_argument("--det_pse_thresh", type=float, default=0) - parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) - parser.add_argument("--det_pse_min_area", type=float, default=16) - parser.add_argument("--det_pse_scale", type=int, default=1) - - # FCE parmas - parser.add_argument("--scales", type=list, default=[8, 16, 32]) - parser.add_argument("--alpha", type=float, default=1.0) - parser.add_argument("--beta", type=float, default=1.0) - parser.add_argument("--fourier_degree", type=int, default=5) - - # params for text recognizer - parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet') - parser.add_argument("--rec_model_dir", type=str) - parser.add_argument("--rec_image_inverse", type=str2bool, default=True) - parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") - parser.add_argument("--rec_batch_num", type=int, default=6) - parser.add_argument("--max_text_length", type=int, default=25) - parser.add_argument( - "--rec_char_dict_path", - type=str, - default="./ppocr/utils/ppocr_keys_v1.txt") - parser.add_argument("--use_space_char", type=str2bool, default=True) - parser.add_argument( - "--vis_font_path", 
type=str, default="./doc/fonts/simfang.ttf") - parser.add_argument("--drop_score", type=float, default=0.5) - - # params for e2e - parser.add_argument("--e2e_algorithm", type=str, default='PGNet') - parser.add_argument("--e2e_model_dir", type=str) - parser.add_argument("--e2e_limit_side_len", type=float, default=768) - parser.add_argument("--e2e_limit_type", type=str, default='max') - - # PGNet parmas - parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) - parser.add_argument( - "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt") - parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') - parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') - - # params for text classifier - parser.add_argument("--use_angle_cls", type=str2bool, default=False) - parser.add_argument("--cls_model_dir", type=str) - parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") - parser.add_argument("--label_list", type=list, default=['0', '180']) - parser.add_argument("--cls_batch_num", type=int, default=6) - parser.add_argument("--cls_thresh", type=float, default=0.9) - - parser.add_argument("--enable_mkldnn", type=str2bool, default=False) - parser.add_argument("--cpu_threads", type=int, default=10) - parser.add_argument("--use_pdserving", type=str2bool, default=False) - parser.add_argument("--warmup", type=str2bool, default=False) - - # SR parmas - parser.add_argument("--sr_model_dir", type=str) - parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") - parser.add_argument("--sr_batch_num", type=int, default=1) - - # - parser.add_argument( - "--draw_img_save_dir", type=str, default="./inference_results") - parser.add_argument("--save_crop_res", type=str2bool, default=False) - parser.add_argument("--crop_res_save_dir", type=str, default="./output") - - # multi-process - parser.add_argument("--use_mp", type=str2bool, default=False) - parser.add_argument("--total_process_num", type=int, default=1) - parser.add_argument("--process_id", type=int, default=0) - - parser.add_argument("--benchmark", type=str2bool, default=False) - parser.add_argument("--save_log_path", type=str, default="./log_output/") - - parser.add_argument("--show_log", type=str2bool, default=True) - parser.add_argument("--use_onnx", type=str2bool, default=False) - parser.add_argument("--use_rknn", type=str2bool, default=False) - parser.add_argument("--platform", type=str, default="rk3568") - return parser - - -def parse_args(): - parser = init_args() - return parser.parse_args() - - -def create_predictor(args, mode, logger): - if mode == "det": - model_dir = args.det_model_dir - elif mode == 'cls': - model_dir = args.cls_model_dir - elif mode == 'rec': - model_dir = args.rec_model_dir - elif mode == 'table': - model_dir = args.table_model_dir - elif mode == 'ser': - model_dir = args.ser_model_dir - elif mode == 're': - model_dir = args.re_model_dir - elif mode == "sr": - model_dir = args.sr_model_dir - elif mode == 'layout': - model_dir = args.layout_model_dir - else: - model_dir = args.e2e_model_dir - - if model_dir is None: - logger.info("not find {} model file path {}".format(mode, model_dir)) - sys.exit(0) - if args.use_onnx: - import onnxruntime as ort - model_file_path = model_dir - if not os.path.exists(model_file_path): - raise ValueError("not find model file path {}".format( - model_file_path)) - sess = ort.InferenceSession(model_file_path) - return sess, sess.get_inputs()[0], None, None - elif args.use_rknn: - from rknn.api import RKNN 
- rknn = RKNN() - print('--> Load rknn model') - model_file_path = model_dir - if not os.path.exists(model_file_path): - raise ValueError("not find model file path {}".format( - model_file_path)) - ret = rknn.load_rknn(model_file_path) - if ret != 0: - print('Load rknn model failed!') - exit(ret) - print('done') - print('--> Init runtime environment') - # ret = rknn.init_runtime() - ret = rknn.init_runtime(args.platform) - if ret != 0: - print('Init runtime environment failed!') - exit(ret) - print('done') - return rknn, None, None, None - else: - file_names = ['model', 'inference'] - for file_name in file_names: - model_file_path = '{}/{}.pdmodel'.format(model_dir, file_name) - params_file_path = '{}/{}.pdiparams'.format(model_dir, file_name) - if os.path.exists(model_file_path) and os.path.exists( - params_file_path): - break - if not os.path.exists(model_file_path): - raise ValueError( - "not find model.pdmodel or inference.pdmodel in {}".format( - model_dir)) - if not os.path.exists(params_file_path): - raise ValueError( - "not find model.pdiparams or inference.pdiparams in {}".format( - model_dir)) - - config = inference.Config(model_file_path, params_file_path) - - if hasattr(args, 'precision'): - if args.precision == "fp16" and args.use_tensorrt: - precision = inference.PrecisionType.Half - elif args.precision == "int8": - precision = inference.PrecisionType.Int8 - else: - precision = inference.PrecisionType.Float32 - else: - precision = inference.PrecisionType.Float32 - - if args.use_gpu: - gpu_id = get_infer_gpuid() - if gpu_id is None: - logger.warning( - "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." - ) - config.enable_use_gpu(args.gpu_mem, args.gpu_id) - if args.use_tensorrt: - config.enable_tensorrt_engine( - workspace_size=1 << 30, - precision_mode=precision, - max_batch_size=args.max_batch_size, - min_subgraph_size=args. 
- min_subgraph_size, # skip the minmum trt subgraph - use_calib_mode=False) - - # collect shape - trt_shape_f = os.path.join(model_dir, - f"{mode}_trt_dynamic_shape.txt") - - if not os.path.exists(trt_shape_f): - config.collect_shape_range_info(trt_shape_f) - logger.info( - f"collect dynamic shape info into : {trt_shape_f}") - try: - config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, - True) - except Exception as E: - logger.info(E) - logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!") - - elif args.use_npu: - config.enable_custom_device("npu") - elif args.use_xpu: - config.enable_xpu(10 * 1024 * 1024) - else: - config.disable_gpu() - if args.enable_mkldnn: - # cache 10 different shapes for mkldnn to avoid memory leak - config.set_mkldnn_cache_capacity(10) - config.enable_mkldnn() - if args.precision == "fp16": - config.enable_mkldnn_bfloat16() - if hasattr(args, "cpu_threads"): - config.set_cpu_math_library_num_threads(args.cpu_threads) - else: - # default cpu threads as 10 - config.set_cpu_math_library_num_threads(10) - # enable memory optim - config.enable_memory_optim() - config.disable_glog_info() - config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") - config.delete_pass("matmul_transpose_reshape_fuse_pass") - if mode == 're': - config.delete_pass("simplify_with_basic_ops_pass") - if mode == 'table': - config.delete_pass("fc_fuse_pass") # not supported for table - config.switch_use_feed_fetch_ops(False) - config.switch_ir_optim(True) - - # create predictor - predictor = inference.create_predictor(config) - input_names = predictor.get_input_names() - if mode in ['ser', 're']: - input_tensor = [] - for name in input_names: - input_tensor.append(predictor.get_input_handle(name)) - else: - for name in input_names: - input_tensor = predictor.get_input_handle(name) - output_tensors = get_output_tensors(args, mode, predictor) - return predictor, input_tensor, output_tensors, config - - -def get_output_tensors(args, mode, predictor): - output_names = predictor.get_output_names() - output_tensors = [] - if mode == "rec" and args.rec_algorithm in [ - "CRNN", "SVTR_LCNet", "SVTR_HGNet" - ]: - output_name = 'softmax_0.tmp_0' - if output_name in output_names: - return [predictor.get_output_handle(output_name)] - else: - for output_name in output_names: - output_tensor = predictor.get_output_handle(output_name) - output_tensors.append(output_tensor) - else: - for output_name in output_names: - output_tensor = predictor.get_output_handle(output_name) - output_tensors.append(output_tensor) - return output_tensors - - -def get_infer_gpuid(): - sysstr = platform.system() - if sysstr == "Windows": - return 0 - - if not paddle.device.is_compiled_with_rocm: - cmd = "env | grep CUDA_VISIBLE_DEVICES" - else: - cmd = "env | grep HIP_VISIBLE_DEVICES" - env_cuda = os.popen(cmd).readlines() - if len(env_cuda) == 0: - return 0 - else: - gpu_id = env_cuda[0].strip().split("=")[1] - return int(gpu_id[0]) - - -def draw_e2e_res(dt_boxes, strs, img_path): - src_im = cv2.imread(img_path) - for box, str in zip(dt_boxes, strs): - box = box.astype(np.int32).reshape((-1, 1, 2)) - cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) - cv2.putText( - src_im, - str, - org=(int(box[0, 0, 0]), int(box[0, 0, 1])), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=0.7, - color=(0, 255, 0), - thickness=1) - return src_im - - -def draw_text_det_res(dt_boxes, img): - for box in dt_boxes: - box = np.array(box).astype(np.int32).reshape(-1, 2) - cv2.polylines(img, [box], True, color=(255, 255, 0), 
thickness=2) - return img - - -def resize_img(img, input_size=600): - """ - resize img and limit the longest side of the image to input_size - """ - img = np.array(img) - im_shape = img.shape - im_size_max = np.max(im_shape[0:2]) - im_scale = float(input_size) / float(im_size_max) - img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) - return img - - -def draw_ocr(image, - boxes, - txts=None, - scores=None, - drop_score=0.5, - font_path="./doc/fonts/simfang.ttf"): - """ - Visualize the results of OCR detection and recognition - args: - image(Image|array): RGB image - boxes(list): boxes with shape(N, 4, 2) - txts(list): the texts - scores(list): txxs corresponding scores - drop_score(float): only scores greater than drop_threshold will be visualized - font_path: the path of font which is used to draw text - return(array): - the visualized img - """ - if scores is None: - scores = [1] * len(boxes) - box_num = len(boxes) - for i in range(box_num): - if scores is not None and (scores[i] < drop_score or - math.isnan(scores[i])): - continue - box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64) - image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) - if txts is not None: - img = np.array(resize_img(image, input_size=600)) - txt_img = text_visual( - txts, - scores, - img_h=img.shape[0], - img_w=600, - threshold=drop_score, - font_path=font_path) - img = np.concatenate([np.array(img), np.array(txt_img)], axis=1) - return img - return image - - -def draw_ocr_box_txt(image, - boxes, - txts=None, - scores=None, - drop_score=0.5, - font_path="./doc/fonts/simfang.ttf"): - h, w = image.height, image.width - img_left = image.copy() - img_right = np.ones((h, w, 3), dtype=np.uint8) * 255 - random.seed(0) - - draw_left = ImageDraw.Draw(img_left) - if txts is None or len(txts) != len(boxes): - txts = [None] * len(boxes) - for idx, (box, txt) in enumerate(zip(boxes, txts)): - if scores is not None and scores[idx] < drop_score: - continue - color = (random.randint(0, 255), random.randint(0, 255), - random.randint(0, 255)) - draw_left.polygon(box, fill=color) - img_right_text = draw_box_txt_fine((w, h), box, txt, font_path) - pts = np.array(box, np.int32).reshape((-1, 1, 2)) - cv2.polylines(img_right_text, [pts], True, color, 1) - img_right = cv2.bitwise_and(img_right, img_right_text) - img_left = Image.blend(image, img_left, 0.5) - img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) - img_show.paste(img_left, (0, 0, w, h)) - img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h)) - return np.array(img_show) - - -def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): - box_height = int( - math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][1])**2)) - box_width = int( - math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][1])**2)) - - if box_height > 2 * box_width and box_height > 30: - img_text = Image.new('RGB', (box_height, box_width), (255, 255, 255)) - draw_text = ImageDraw.Draw(img_text) - if txt: - font = create_font(txt, (box_height, box_width), font_path) - draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) - img_text = img_text.transpose(Image.ROTATE_270) - else: - img_text = Image.new('RGB', (box_width, box_height), (255, 255, 255)) - draw_text = ImageDraw.Draw(img_text) - if txt: - font = create_font(txt, (box_width, box_height), font_path) - draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) - - pts1 = np.float32( - [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]) - pts2 = np.array(box, 
dtype=np.float32) - M = cv2.getPerspectiveTransform(pts1, pts2) - - img_text = np.array(img_text, dtype=np.uint8) - img_right_text = cv2.warpPerspective( - img_text, - M, - img_size, - flags=cv2.INTER_NEAREST, - borderMode=cv2.BORDER_CONSTANT, - borderValue=(255, 255, 255)) - return img_right_text - - -def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"): - font_size = int(sz[1] * 0.99) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - length = font.getlength(txt) - if length > sz[0]: - font_size = int(font_size * sz[0] / length) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - return font - - -def str_count(s): - """ - Count the number of Chinese characters, - a single English character and a single number - equal to half the length of Chinese characters. - args: - s(string): the input of string - return(int): - the number of Chinese characters - """ - import string - count_zh = count_pu = 0 - s_len = len(s) - en_dg_count = 0 - for c in s: - if c in string.ascii_letters or c.isdigit() or c.isspace(): - en_dg_count += 1 - elif c.isalpha(): - count_zh += 1 - else: - count_pu += 1 - return s_len - math.ceil(en_dg_count / 2) - - -def text_visual(texts, - scores, - img_h=400, - img_w=600, - threshold=0., - font_path="./doc/simfang.ttf"): - """ - create new blank img and draw txt on it - args: - texts(list): the text will be draw - scores(list|None): corresponding score of each txt - img_h(int): the height of blank img - img_w(int): the width of blank img - font_path: the path of font which is used to draw text - return(array): - """ - if scores is not None: - assert len(texts) == len( - scores), "The number of txts and corresponding scores must match" - - def create_blank_img(): - blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 - blank_img[:, img_w - 1:] = 0 - blank_img = Image.fromarray(blank_img).convert("RGB") - draw_txt = ImageDraw.Draw(blank_img) - return blank_img, draw_txt - - blank_img, draw_txt = create_blank_img() - - font_size = 20 - txt_color = (0, 0, 0) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - - gap = font_size + 5 - txt_img_list = [] - count, index = 1, 0 - for idx, txt in enumerate(texts): - index += 1 - if scores[idx] < threshold or math.isnan(scores[idx]): - index -= 1 - continue - first_line = True - while str_count(txt) >= img_w // font_size - 4: - tmp = txt - txt = tmp[:img_w // font_size - 4] - if first_line: - new_txt = str(index) + ': ' + txt - first_line = False - else: - new_txt = ' ' + txt - draw_txt.text((0, gap * count), new_txt, txt_color, font=font) - txt = tmp[img_w // font_size - 4:] - if count >= img_h // gap - 1: - txt_img_list.append(np.array(blank_img)) - blank_img, draw_txt = create_blank_img() - count = 0 - count += 1 - if first_line: - new_txt = str(index) + ': ' + txt + ' ' + '%.3f' % (scores[idx]) - else: - new_txt = " " + txt + " " + '%.3f' % (scores[idx]) - draw_txt.text((0, gap * count), new_txt, txt_color, font=font) - # whether add new blank img or not - if count >= img_h // gap - 1 and idx + 1 < len(texts): - txt_img_list.append(np.array(blank_img)) - blank_img, draw_txt = create_blank_img() - count = 0 - count += 1 - txt_img_list.append(np.array(blank_img)) - if len(txt_img_list) == 1: - blank_img = np.array(txt_img_list[0]) - else: - blank_img = np.concatenate(txt_img_list, axis=1) - return np.array(blank_img) - - -def base64_to_cv2(b64str): - import base64 - data = base64.b64decode(b64str.encode('utf8')) - data = np.frombuffer(data, np.uint8) - data = 
cv2.imdecode(data, cv2.IMREAD_COLOR) - return data - - -def draw_boxes(image, boxes, scores=None, drop_score=0.5): - if scores is None: - scores = [1] * len(boxes) - for (box, score) in zip(boxes, scores): - if score < drop_score: - continue - box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) - image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) - return image - - -def get_rotate_crop_image(img, points): - ''' - img_height, img_width = img.shape[0:2] - left = int(np.min(points[:, 0])) - right = int(np.max(points[:, 0])) - top = int(np.min(points[:, 1])) - bottom = int(np.max(points[:, 1])) - img_crop = img[top:bottom, left:right, :].copy() - points[:, 0] = points[:, 0] - left - points[:, 1] = points[:, 1] - top - ''' - assert len(points) == 4, "shape of points must be 4*2" - img_crop_width = int( - max( - np.linalg.norm(points[0] - points[1]), - np.linalg.norm(points[2] - points[3]))) - img_crop_height = int( - max( - np.linalg.norm(points[0] - points[3]), - np.linalg.norm(points[1] - points[2]))) - pts_std = np.float32([[0, 0], [img_crop_width, 0], - [img_crop_width, img_crop_height], - [0, img_crop_height]]) - M = cv2.getPerspectiveTransform(points, pts_std) - dst_img = cv2.warpPerspective( - img, - M, (img_crop_width, img_crop_height), - borderMode=cv2.BORDER_REPLICATE, - flags=cv2.INTER_CUBIC) - dst_img_height, dst_img_width = dst_img.shape[0:2] - if dst_img_height * 1.0 / dst_img_width >= 1.5: - dst_img = np.rot90(dst_img) - return dst_img - - -def get_minarea_rect_crop(img, points): - bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32)) - points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) - - index_a, index_b, index_c, index_d = 0, 1, 2, 3 - if points[1][1] > points[0][1]: - index_a = 0 - index_d = 1 - else: - index_a = 1 - index_d = 0 - if points[3][1] > points[2][1]: - index_b = 2 - index_c = 3 - else: - index_b = 3 - index_c = 2 - - box = [points[index_a], points[index_b], points[index_c], points[index_d]] - crop_img = get_rotate_crop_image(img, np.array(box)) - return crop_img, box - - -def check_gpu(use_gpu): - if use_gpu and not paddle.is_compiled_with_cuda(): - use_gpu = False - return use_gpu - - -def _check_image_file(path): - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} - return any([path.lower().endswith(e) for e in img_end]) - - -def get_image_file_list(img_file): - imgs_lists = [] - if img_file is None or not os.path.exists(img_file): - raise Exception("not found any img file in {}".format(img_file)) - - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} - if os.path.isfile(img_file) and _check_image_file(img_file): - imgs_lists.append(img_file) - elif os.path.isdir(img_file): - for single_file in os.listdir(img_file): - file_path = os.path.join(img_file, single_file) - if os.path.isfile(file_path) and _check_image_file(file_path): - imgs_lists.append(file_path) - if len(imgs_lists) == 0: - raise Exception("not found any img file in {}".format(img_file)) - imgs_lists = sorted(imgs_lists) - return imgs_lists - - -def check_and_read(img_path): - if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: - gif = cv2.VideoCapture(img_path) - ret, frame = gif.read() - if not ret: - logger = logging.getLogger('ppocr') - logger.info("Cannot read {}. 
This gif image maybe corrupted.") - return None, False - if len(frame.shape) == 2 or frame.shape[-1] == 1: - frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) - imgvalue = frame[:, :, ::-1] - return imgvalue, True, False - elif os.path.basename(img_path)[-3:] in ['pdf']: - import fitz - from PIL import Image - imgs = [] - with fitz.open(img_path) as pdf: - for pg in range(0, pdf.pageCount): - page = pdf[pg] - mat = fitz.Matrix(2, 2) - pm = page.getPixmap(matrix=mat, alpha=False) - - # if width or height > 2000 pixels, don't enlarge the image - if pm.width > 2000 or pm.height > 2000: - pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) - - img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) - img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) - imgs.append(img) - return imgs, False, True - return None, False, False - - -def create_operators(op_param_list, global_config=None): - """ - create operators based on the config - - Args: - params(list): a dict list, used to create some operators - """ - assert isinstance(op_param_list, list), ('operator config should be a list') - ops = [] - for operator in op_param_list: - assert isinstance(operator, - dict) and len(operator) == 1, "yaml format error" - op_name = list(operator)[0] - param = {} if operator[op_name] is None else operator[op_name] - if global_config is not None: - param.update(global_config) - op = eval(op_name)(**param) - ops.append(op) - return ops - - -def transform(data, ops=None): - """ transform """ - if ops is None: - ops = [] - for op in ops: - data = op(data) - if data is None: - return None - return data - - -if __name__ == '__main__': - pass diff --git a/examples/PPOCR/PPOCR-Rec/python/utils/__init__.py b/examples/PPOCR/PPOCR-Rec/python/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/PPOCR/PPOCR-Rec/python/utils/operators.py b/examples/PPOCR/PPOCR-Rec/python/utils/operators.py new file mode 100644 index 0000000..f19c15f --- /dev/null +++ b/examples/PPOCR/PPOCR-Rec/python/utils/operators.py @@ -0,0 +1,373 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class NRTRDecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + + img = cv2.imdecode(img, 1) + + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) + if self.channel_first: + img = img.transpose((2, 0, 1)) + data['image'] = img + return data + +class NormalizeImage(object): + """ normalize image such as substract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale - self.mean) / self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + return data + + +class KeepKeys(object): + def __init__(self, keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + data_list.append(data[key]) + return data_list + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + super(DetResizeForTest, self).__init__() + self.square_input = True + self.resize_type = 0 + if 
'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + + + + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + if len(data['shape'].shape) == 1: + data['shape'] = np.expand_dims(data['shape'], axis=0) + return data + + def resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = self.limit_side_len + h, w, c = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'min': + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. 
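+ # 'resize_long' (below) scales the image so its longer side becomes limit_side_len, before rounding to a multiple of 32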
+ elif self.limit_type == 'resize_long': + ratio = float(limit_side_len) / max(h,w) + else: + raise Exception('not support limit type, image ') + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = max(int(round(resize_h / 32) * 32), 32) + resize_w = max(int(round(resize_w / 32) * 32), 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] + + +class E2EResizeForTest(object): + def __init__(self, **kwargs): + super(E2EResizeForTest, self).__init__() + self.max_side_len = kwargs['max_side_len'] + self.valid_set = kwargs['valid_set'] + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if self.valid_set == 'totaltext': + im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext( + img, max_side_len=self.max_side_len) + else: + im_resized, (ratio_h, ratio_w) = self.resize_image( + img, max_side_len=self.max_side_len) + data['image'] = im_resized + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_for_totaltext(self, im, max_side_len=512): + + h, w, _ = im.shape + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + def resize_image(self, im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + + +class Pad_to_max_len(object): + def __init__(self, **kwargs): + super(Pad_to_max_len, self).__init__() + self.max_h = kwargs['max_h'] + self.max_w = kwargs['max_w'] + + def __call__(self, data): + img = data['image'] + if img.shape[-1] == 3: + # hwc + 
if img.shape[0] != self.max_h: + # TODO support + # assert False, "not support" + pad_h = self.max_h - img.shape[0] + pad_w = self.max_w - img.shape[1] + img = np.pad(img, ((0, pad_h), (0, pad_w), (0, 0)), 'constant', constant_values=0) + if img.shape[1] < self.max_w: + pad_w = self.max_w - img.shape[1] + img = np.pad(img, ((0, 0), (0, pad_w), (0, 0)), 'constant', constant_values=0) + + elif img.shape[0] == 3: + # chw + img = img.transpose((1, 2, 0)) + if img.shape[1] != self.max_h: + # TODO support + assert False, "not support" + if img.shape[0] < self.max_w: + pad_w = self.max_w - img.shape[0] + img = np.pad(img, ((0, 0), (0, 0), (0, pad_w)), 'constant', constant_values=0) + + else: + assert False, "not support" + + data['image'] = img + + return data \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-Rec/python/utils/rec_postprocess.py b/examples/PPOCR/PPOCR-Rec/python/utils/rec_postprocess.py new file mode 100644 index 0000000..3aa3585 --- /dev/null +++ b/examples/PPOCR/PPOCR-Rec/python/utils/rec_postprocess.py @@ -0,0 +1,814 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +# import paddle +# from paddle.nn import functional as F +import re + + +class BaseRecLabelDecode(object): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False): + self.beg_str = "sos" + self.end_str = "eos" + + self.character_str = [] + if character_dict_path is None: + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + else: + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str.append(line) + if use_space_char: + self.character_str.append(" ") + dict_character = list(self.character_str) + + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + # character_dict_path may be None (built-in dict), so guard the substring test + if character_dict_path is not None and 'arabic' in character_dict_path: + self.reverse = True + else: + self.reverse = False + + def pred_reverse(self, pred): + pred_re = [] + c_current = '' + for c in pred: + if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)): + if c_current != '': + pred_re.append(c_current) + pred_re.append(c) + c_current = '' + else: + c_current += c + if c_current != '': + pred_re.append(c_current) + + return ''.join(pred_re[::-1]) + + def add_special_char(self, dict_character): + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + selection = np.ones(len(text_index[batch_idx]), dtype=bool) + if is_remove_duplicate: + selection[1:] = text_index[batch_idx][1:] != text_index[ + batch_idx][:-1] + for ignored_token in ignored_tokens: + selection &= text_index[batch_idx] != ignored_token + + char_list = [ + self.character[text_id] + for text_id in text_index[batch_idx][selection] + ] + if text_prob is not None: + conf_list = text_prob[batch_idx][selection] + else: + conf_list = [1] * len(selection) + if len(conf_list) == 0: + conf_list = [0] + + text = ''.join(char_list) + + if self.reverse: # for arabic rec + text = self.pred_reverse(text) + + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def get_ignored_tokens(self): + return [0] # for ctc blank + + +class CTCLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, tuple) or isinstance(preds, list): + preds = preds[-1] + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class DistillationCTCLabelDecode(CTCLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationCTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['ctc'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class AttnLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(AttnLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SEEDLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SEEDLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.padding_str = "padding" + self.end_str = "eos" + self.unknown = "unknown" + dict_character = dict_character + [ + self.end_str, self.padding_str, self.unknown + ] + return dict_character + + def get_ignored_tokens(self): + end_idx = self.get_beg_end_flag_idx("eos") + return [end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "sos": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "eos": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" % beg_or_end + return idx + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + [end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + preds_idx = preds["rec_pred"] + # if isinstance(preds_idx, paddle.Tensor): + # preds_idx = preds_idx.numpy() + if "rec_pred_scores" in preds: + preds_idx = preds["rec_pred"] + preds_prob = preds["rec_pred_scores"] + else: + preds_idx = preds["rec_pred"].argmax(axis=2) + preds_prob = preds["rec_pred"].max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + +class SRNLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SRNLabelDecode, self).__init__(character_dict_path, + use_space_char) + self.max_text_length = kwargs.get('max_text_length', 25) + + def __call__(self, preds, label=None, *args, **kwargs): + pred = preds['predict'] + char_num = len(self.character_str) + 2 + # if isinstance(pred, paddle.Tensor): + # pred = pred.numpy() + pred = np.reshape(pred, [-1, char_num]) + + preds_idx = np.argmax(pred, axis=1) + preds_prob = np.max(pred, axis=1) + + preds_idx = np.reshape(preds_idx, [-1, self.max_text_length]) + + preds_prob = np.reshape(preds_prob, [-1, self.max_text_length]) + + text = self.decode(preds_idx, preds_prob) + + if label is None: + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + return text + label = self.decode(label) + return text, label + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SARLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SARLabelDecode, self).__init__(character_dict_path, + use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', False) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class DistillationSARLabelDecode(SARLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationSARLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['sar'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class PRENLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(PRENLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + padding_str = '' # 0 + end_str = '' # 1 + unknown_str = '' # 2 + + dict_character = [padding_str, end_str, unknown_str] + dict_character + self.padding_idx = 0 + self.end_idx = 1 + self.unknown_idx = 2 + + return dict_character + + def decode(self, text_index, text_prob=None): + """ convert text-index into text-label. 
""" + result_list = [] + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] == self.end_idx: + break + if text_index[batch_idx][idx] in \ + [self.padding_idx, self.unknown_idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + if len(text) > 0: + result_list.append((text, np.mean(conf_list).tolist())) + else: + # here confidence of empty recog result is 1 + result_list.append(('', 1)) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob) + if label is None: + return text + label = self.decode(label) + return text, label + + +class NRTRLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): + super(NRTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + # if isinstance(preds_id, paddle.Tensor): + # preds_id = preds_id.numpy() + # if isinstance(preds_prob, paddle.Tensor): + # preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + else: + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + try: + char_idx = self.character[int(text_index[batch_idx][idx])] + except: + continue + if char_idx == '': # end + break + char_list.append(char_idx) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text.lower(), np.mean(conf_list).tolist())) + return result_list + + +class ViTSTRLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ViTSTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + # if isinstance(preds, paddle.Tensor): + # preds = preds[:, 1:].numpy() + # else: + # preds = preds[:, 1:] + preds = preds[:, 1:].numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['', ''] + dict_character + return dict_character + + +class ABINetLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ABINetLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, dict): + preds = preds['align'][-1].numpy() + # elif isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + # else: + # preds = preds + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = [''] + dict_character + return dict_character + + +class SPINLabelDecode(AttnLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SPINLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + [self.end_str] + dict_character + return dict_character + + +# class VLLabelDecode(BaseRecLabelDecode): +# """ Convert between text-label and text-index """ + +# def __init__(self, character_dict_path=None, use_space_char=False, +# **kwargs): +# super(VLLabelDecode, self).__init__(character_dict_path, use_space_char) +# self.max_text_length = kwargs.get('max_text_length', 25) +# self.nclass = len(self.character) + 1 +# self.character = self.character[10:] + self.character[ +# 1:10] + [self.character[0]] + +# def decode(self, text_index, text_prob=None, is_remove_duplicate=False): +# """ convert text-index into text-label. 
""" +# result_list = [] +# ignored_tokens = self.get_ignored_tokens() +# batch_size = len(text_index) +# for batch_idx in range(batch_size): +# selection = np.ones(len(text_index[batch_idx]), dtype=bool) +# if is_remove_duplicate: +# selection[1:] = text_index[batch_idx][1:] != text_index[ +# batch_idx][:-1] +# for ignored_token in ignored_tokens: +# selection &= text_index[batch_idx] != ignored_token + +# char_list = [ +# self.character[text_id - 1] +# for text_id in text_index[batch_idx][selection] +# ] +# if text_prob is not None: +# conf_list = text_prob[batch_idx][selection] +# else: +# conf_list = [1] * len(selection) +# if len(conf_list) == 0: +# conf_list = [0] + +# text = ''.join(char_list) +# result_list.append((text, np.mean(conf_list).tolist())) +# return result_list + +# def __call__(self, preds, label=None, length=None, *args, **kwargs): +# if len(preds) == 2: # eval mode +# text_pre, x = preds +# b = text_pre.shape[1] +# lenText = self.max_text_length +# nsteps = self.max_text_length + +# if not isinstance(text_pre, paddle.Tensor): +# text_pre = paddle.to_tensor(text_pre, dtype='float32') + +# out_res = paddle.zeros( +# shape=[lenText, b, self.nclass], dtype=x.dtype) +# out_length = paddle.zeros(shape=[b], dtype=x.dtype) +# now_step = 0 +# for _ in range(nsteps): +# if 0 in out_length and now_step < nsteps: +# tmp_result = text_pre[now_step, :, :] +# out_res[now_step] = tmp_result +# tmp_result = tmp_result.topk(1)[1].squeeze(axis=1) +# for j in range(b): +# if out_length[j] == 0 and tmp_result[j] == 0: +# out_length[j] = now_step + 1 +# now_step += 1 +# for j in range(0, b): +# if int(out_length[j]) == 0: +# out_length[j] = nsteps +# start = 0 +# output = paddle.zeros( +# shape=[int(out_length.sum()), self.nclass], dtype=x.dtype) +# for i in range(0, b): +# cur_length = int(out_length[i]) +# output[start:start + cur_length] = out_res[0:cur_length, i, :] +# start += cur_length +# net_out = output +# length = out_length + +# else: # train mode +# net_out = preds[0] +# length = length +# net_out = paddle.concat([t[:l] for t, l in zip(net_out, length)]) +# text = [] +# if not isinstance(net_out, paddle.Tensor): +# net_out = paddle.to_tensor(net_out, dtype='float32') +# net_out = F.softmax(net_out, axis=1) +# for i in range(0, length.shape[0]): +# preds_idx = net_out[int(length[:i].sum()):int(length[:i].sum( +# ) + length[i])].topk(1)[1][:, 0].tolist() +# preds_text = ''.join([ +# self.character[idx - 1] +# if idx > 0 and idx <= len(self.character) else '' +# for idx in preds_idx +# ]) +# preds_prob = net_out[int(length[:i].sum()):int(length[:i].sum( +# ) + length[i])].topk(1)[0][:, 0] +# preds_prob = paddle.exp( +# paddle.log(preds_prob).sum() / (preds_prob.shape[0] + 1e-6)) +# text.append((preds_text, preds_prob.numpy()[0])) +# if label is None: +# return text +# label = self.decode(label) +# return text, label + diff --git a/examples/PPOCR/PPOCR-System/README.md b/examples/PPOCR/PPOCR-System/README.md index 92d67ee..fe7326c 100644 --- a/examples/PPOCR/PPOCR-System/README.md +++ b/examples/PPOCR/PPOCR-System/README.md @@ -1,42 +1,38 @@ # PPOCR-System +## Current Support Platform + +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 + ## Prepare model Refer [PPOCR-Det](../PPOCR-Det) and [PPOCR-Rec](../PPOCR-Rec) to get ONNX and RKNN models. 
-## Script Usage -Install libs: +## Python Demo -```bash -pip install -r python/requirements.txt -``` +*Usage:* -For ONNX: +```shell +cd python -```bash -python python/ppocr_system.py \ - --image_dir model/test1.jpg \ - --det_model_dir ../PPOCR-Det/model/ppocrv4_det.onnx \ - --rec_model_dir ../PPOCR-Rec/model/ppocrv4_rec.onnx \ - --rec_char_dict_path ../PPOCR-Rec/model/ppocr_keys_v1.txt \ - --vis_font_path model/simfang.ttf \ - --use_gpu false --use_onnx true --rec_image_shape "3, 48, 320" -``` +# Inference with ONNX model +python ppocr_system.py --det_model_path <onnx_model> --rec_model_path <onnx_model> +# such as: python ppocr_system.py --det_model_path ../../PPOCR-Det/model/ppocrv4_det.onnx --rec_model_path ../../PPOCR-Rec/model/ppocrv4_rec.onnx -For RKNN: -```bash -python python/ppocr_system.py \ - --image_dir model/test1.jpg \ - --det_model_dir ../PPOCR-Det/model/ppocrv4_det.rknn \ - --rec_model_dir ../PPOCR-Rec/model/ppocrv4_rec.rknn \ - --rec_char_dict_path ../PPOCR-Rec/model/ppocr_keys_v1.txt \ - --vis_font_path model/simfang.ttf \ - --use_gpu false --use_rknn true --platform rk3568 --det_image_shape 480 480 --rec_image_shape "3, 48, 320" +# Inference with RKNN model +python ppocr_system.py --det_model_path <rknn_model> --rec_model_path <rknn_model> --target <TARGET_PLATFORM> +# such as: python ppocr_system.py --det_model_path ../../PPOCR-Det/model/ppocrv4_det.rknn --rec_model_path ../../PPOCR-Rec/model/ppocrv4_rec.rknn --target rk3588 ``` +*Description:* +- <TARGET_PLATFORM>: Specify NPU platform name. Such as 'rk3588'. + +- <onnx_model / rknn_model>: specified as the model path. + ## Android Demo +**Note: RK1808, RV1109, RV1126 do not support Android.** ### Compiling && Building @@ -72,7 +68,7 @@ adb shell cd /data/rknn_PPOCR-System_demo export LD_LIBRARY_PATH=./lib -./rknn_ppocr_system_demo model/ppocrv4_det.rknn model/ppocrv4_rec.rknn model/test1.jpg +./rknn_ppocr_system_demo model/ppocrv4_det.rknn model/ppocrv4_rec.rknn model/test.jpg ``` ## Aarch64 Linux Demo @@ -115,7 +111,7 @@ adb shell cd /data/rknn_PPOCR-System_demo export LD_LIBRARY_PATH=./lib -./rknn_ppocr_system_demo model/ppocrv4_det.rknn model/ppocrv4_rec.rknn model/test1.jpg +./rknn_ppocr_system_demo model/ppocrv4_det.rknn model/ppocrv4_rec.rknn model/test.jpg ``` Note: Try searching the location of librga.so and add it to LD_LIBRARY_PATH if the librga.so is not found in the lib folder. diff --git a/examples/PPOCR/PPOCR-System/cpp/CMakeLists.txt b/examples/PPOCR/PPOCR-System/cpp/CMakeLists.txt index 5d94d7f..bf01e3d 100644 --- a/examples/PPOCR/PPOCR-System/cpp/CMakeLists.txt +++ b/examples/PPOCR/PPOCR-System/cpp/CMakeLists.txt @@ -37,13 +37,20 @@ message(STATUS OpenCV_LIBS=${OpenCV_LIBS}) set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(ppocr_system_file rknpu1/ppocr_system.cc) +else() + set(ppocr_system_file rknpu2/ppocr_system.cc) +endif() + + file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc postprocess.cc clipper.cc - rknpu2/ppocr_system.cc + ${ppocr_system_file} ) target_link_libraries(${PROJECT_NAME} @@ -71,7 +78,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE ) install(TARGETS ${PROJECT_NAME} DESTINATION .) 
-install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/test1.jpg DESTINATION model) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/test.jpg DESTINATION model) set(file_path ${CMAKE_CURRENT_SOURCE_DIR}/../../PPOCR-Det/model/ppocrv4_det.rknn) if (EXISTS ${file_path}) install(FILES ${file_path} DESTINATION model) diff --git a/examples/PPOCR/PPOCR-System/cpp/rknpu1/ppocr_system.cc b/examples/PPOCR/PPOCR-System/cpp/rknpu1/ppocr_system.cc new file mode 100644 index 0000000..6f46111 --- /dev/null +++ b/examples/PPOCR/PPOCR-System/cpp/rknpu1/ppocr_system.cc @@ -0,0 +1,449 @@ +#include +#include +#include +#include +#include + +#include "opencv2/opencv.hpp" +#include "ppocr_system.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +bool CompareBox(const std::array<int, 8>& result1, const std::array<int, 8>& result2) +{ + if (result1[1] < result2[1]) + { + return true; + } else if (result1[1] == result2[1]) + { + return result1[0] < result2[0]; + } else + { + return false; + } +} + +void SortBoxes(std::vector<std::array<int, 8>>* boxes) +{ + std::sort(boxes->begin(), boxes->end(), CompareBox); + + if (boxes->size() == 0) + { + return; + } + + for (int i = 0; i < boxes->size() - 1; i++) { + for (int j = i; j >=0 ; j--){ + if (std::abs((*boxes)[j + 1][1] - (*boxes)[j][1]) < 10 && ((*boxes)[j + 1][0] < (*boxes)[j][0])) + { + std::swap((*boxes)[i], (*boxes)[i + 1]); + } + } + } + +} + +cv::Mat GetRotateCropImage(const cv::Mat& srcimage, const std::array<int, 8>& box) +{ + cv::Mat image; + srcimage.copyTo(image); + + std::vector<std::vector<int>> points; + + for (int i = 0; i < 4; ++i) { + std::vector<int> tmp; + tmp.push_back(box[2 * i]); + tmp.push_back(box[2 * i + 1]); + points.push_back(tmp); + } + int x_collect[4] = {box[0], box[2], box[4], box[6]}; + int y_collect[4] = {box[1], box[3], box[5], box[7]}; + int left = int(*std::min_element(x_collect, x_collect + 4)); + int right = int(*std::max_element(x_collect, x_collect + 4)); + int top = int(*std::min_element(y_collect, y_collect + 4)); + int bottom = int(*std::max_element(y_collect, y_collect + 4)); + + cv::Mat img_crop; + image(cv::Rect(left, top, right - left, bottom - top)).copyTo(img_crop); + + for (int i = 0; i < points.size(); i++) { + points[i][0] -= left; + points[i][1] -= top; + } + + int img_crop_width = int(sqrt(pow(points[0][0] - points[1][0], 2) + + pow(points[0][1] - points[1][1], 2))); + int img_crop_height = int(sqrt(pow(points[0][0] - points[3][0], 2) + + pow(points[0][1] - points[3][1], 2))); + + cv::Point2f pts_std[4]; + pts_std[0] = cv::Point2f(0., 0.); + pts_std[1] = cv::Point2f(img_crop_width, 0.); + pts_std[2] = cv::Point2f(img_crop_width, img_crop_height); + pts_std[3] = cv::Point2f(0.f, img_crop_height); + + cv::Point2f pointsf[4]; + pointsf[0] = cv::Point2f(points[0][0], points[0][1]); + pointsf[1] = cv::Point2f(points[1][0], points[1][1]); + pointsf[2] = cv::Point2f(points[2][0], points[2][1]); + pointsf[3] = cv::Point2f(points[3][0], points[3][1]); + + cv::Mat M = cv::getPerspectiveTransform(pointsf, pts_std); + + cv::Mat dst_img; + cv::warpPerspective(img_crop, dst_img, M, + cv::Size(img_crop_width, img_crop_height), + cv::BORDER_REPLICATE); + + if (float(dst_img.rows) >= float(dst_img.cols) * 1.5) { + cv::Mat srcCopy = cv::Mat(dst_img.rows, dst_img.cols, dst_img.depth()); + cv::transpose(dst_img, srcCopy); + cv::flip(srcCopy, srcCopy, 0); + return srcCopy; + } else { + return dst_img; + } +} + +static void dump_tensor_attr(rknn_tensor_attr* attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, 
qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_ppocr_model(const char* model_path, rknn_app_context_t* app_ctx) +{ + int ret; + int model_len = 0; + char* model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr*)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr*)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_ppocr_model(rknn_app_context_t* app_ctx) +{ + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_ppocr_det_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, ppocr_det_postprocess_params* params, ppocr_det_result* out_result) +{ + int ret; + image_buffer_t img; + rknn_input inputs[1]; + rknn_output outputs[1]; + + memset(&img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + img.width = app_ctx->model_width; + img.height = app_ctx->model_height; + img.format = IMAGE_FORMAT_RGB888; + img.size = get_image_size(&img); + img.virt_addr = (unsigned char*)malloc(img.size); + if (img.virt_addr == NULL) { + printf("malloc buffer size:%d fail!\n", img.size); + return -1; + } + + ret = convert_image(src_img, &img, NULL, NULL, 0); + if (ret < 0) { + printf("convert_image fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = img.virt_addr; + + float scale_w = (float)src_img->width / (float)img.width; + float scale_h = (float)src_img->height / (float)img.height; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + // printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + // Post Process + ret = dbnet_postprocess((float*)outputs[0].buf, app_ctx->model_width, app_ctx->model_height, + params->threshold, params->box_threshold, params->use_dilate, params->db_score_mode, + params->db_unclip_ratio, params->db_box_type, + scale_w, scale_h, out_result); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + +out: + if (img.virt_addr != NULL) { + free(img.virt_addr); + } + + return ret; +} + +int inference_ppocr_rec_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, ppocr_rec_result* out_result) +{ + int ret; + rknn_input inputs[1]; + rknn_output outputs[1]; + int allow_slight_change = 1; + + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + float ratio = src_img->width / float(src_img->height); + int resized_w; + int imgW = app_ctx->model_width, imgH = app_ctx->model_height; + if (std::ceil(imgH*ratio) > imgW) { + resized_w = imgW; + } + else { + resized_w = std::ceil(imgH*ratio); + } + + cv::Mat img_M = cv::Mat(src_img->height, src_img->width, CV_8UC3,(uint8_t*)src_img->virt_addr); + cv::resize(img_M, img_M, cv::Size(resized_w, imgH)); + img_M.convertTo(img_M, CV_32FC3); + img_M = (img_M - 127.5)/127.5; + if (resized_w < imgW) { + copyMakeBorder(img_M, img_M, 0, 0, 0, imgW- resized_w, cv::BORDER_CONSTANT, 0); + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_FLOAT32; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel * sizeof(float); + // inputs[0].buf = img.virt_addr; + inputs[0].buf = malloc(inputs[0].size); + memcpy(inputs[0].buf, img_M.data, inputs[0].size); + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + // printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + int out_len_seq = app_ctx->model_width / 8; + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + ret = rec_postprocess((float*)outputs[0].buf, MODEL_OUT_CHANNEL, out_len_seq, out_result); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + +out: + if (inputs[0].buf != NULL) { + free(inputs[0].buf); + } + + return ret; +} + +int inference_ppocr_system_model(ppocr_system_app_context* sys_app_ctx, image_buffer_t* src_img, ppocr_det_postprocess_params* params, ppocr_text_recog_array_result_t* out_result) +{ + int ret; + // Detect Text + ppocr_det_result det_results; + ret = inference_ppocr_det_model(&sys_app_ctx->det_context, src_img, params, &det_results); + if (ret != 0) { + printf("inference_ppocr_det_model fail! 
ret=%d\n", ret); + return -1; + } + + // Recogize Text + out_result->count = 0; + if (det_results.count == 0) { // detect nothing + return 0; + } + + // boxes to boxes_result + std::vector> boxes_result; + for (int i=0; i < det_results.count; i++) { + std::array new_box; + new_box[0] = det_results.box[i].left_top.x; + new_box[1] = det_results.box[i].left_top.y; + new_box[2] = det_results.box[i].right_top.x; + new_box[3] = det_results.box[i].right_top.y; + new_box[4] = det_results.box[i].right_bottom.x; + new_box[5] = det_results.box[i].right_bottom.y; + new_box[6] = det_results.box[i].left_bottom.x; + new_box[7] = det_results.box[i].left_bottom.y; + boxes_result.emplace_back(new_box); + } + + // Sort text boxes in order from top to bottom, left to right for speeding up + SortBoxes(&boxes_result); + + // text recognize + for (int i=0; i < boxes_result.size(); i++) { + cv::Mat in_image = cv::Mat(src_img->height, src_img->width, CV_8UC3,(uint8_t*)src_img->virt_addr); + cv::Mat crop_image = GetRotateCropImage(in_image, boxes_result[i]); + image_buffer_t text_img; + memset(&text_img, 0, sizeof(image_buffer_t)); + text_img.width = crop_image.cols; + text_img.height = crop_image.rows; + text_img.format = IMAGE_FORMAT_RGB888; + text_img.size = get_image_size(&text_img); + text_img.virt_addr = (unsigned char*)malloc(text_img.size); + if (text_img.virt_addr == NULL) { + printf("malloc buffer size:%d fail!\n", text_img.size); + return -1; + } + memcpy((void *)text_img.virt_addr, crop_image.data, text_img.size); + + ppocr_rec_result text_result; + text_result.score = 1.0; + ret = inference_ppocr_rec_model(&sys_app_ctx->rec_context, &text_img, &text_result); + if (ret != 0) { + printf("inference_ppocr_rec_model fail! ret=%d\n", ret); + return -1; + } + if (text_img.virt_addr != NULL) { + free(text_img.virt_addr); + } + + if (text_result.score < TEXT_SCORE) { + continue; + } + out_result->text_result[out_result->count].box.left_top.x = boxes_result[i][0]; + out_result->text_result[out_result->count].box.left_top.y = boxes_result[i][1]; + out_result->text_result[out_result->count].box.right_top.x = boxes_result[i][2]; + out_result->text_result[out_result->count].box.right_top.y = boxes_result[i][3]; + out_result->text_result[out_result->count].box.right_bottom.x = boxes_result[i][4]; + out_result->text_result[out_result->count].box.right_bottom.y = boxes_result[i][5]; + out_result->text_result[out_result->count].box.left_bottom.x = boxes_result[i][6]; + out_result->text_result[out_result->count].box.left_bottom.y = boxes_result[i][7]; + out_result->text_result[out_result->count].text = text_result; + out_result->count ++; + } + + return ret; +} \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-System/cpp/rknpu2/ppocr_system.cc b/examples/PPOCR/PPOCR-System/cpp/rknpu2/ppocr_system.cc index 7da65cf..f7677e5 100644 --- a/examples/PPOCR/PPOCR-System/cpp/rknpu2/ppocr_system.cc +++ b/examples/PPOCR/PPOCR-System/cpp/rknpu2/ppocr_system.cc @@ -200,10 +200,6 @@ int init_ppocr_model(const char* model_path, rknn_app_context_t* app_ctx) int release_ppocr_model(rknn_app_context_t* app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -212,6 +208,10 @@ int release_ppocr_model(rknn_app_context_t* app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } 
return 0; } diff --git a/examples/PPOCR/PPOCR-System/model/ppocr_keys_v1.txt b/examples/PPOCR/PPOCR-System/model/ppocr_keys_v1.txt new file mode 100644 index 0000000..84b885d --- /dev/null +++ b/examples/PPOCR/PPOCR-System/model/ppocr_keys_v1.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! +姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 
+绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 
+妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? +郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 
+莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 +炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 
+菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 +胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 
+饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 +閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-System/model/test1.jpg b/examples/PPOCR/PPOCR-System/model/test.jpg similarity index 100% rename from examples/PPOCR/PPOCR-System/model/test1.jpg rename to examples/PPOCR/PPOCR-System/model/test.jpg diff --git a/examples/PPOCR/PPOCR-System/python/ppocr_cls.py b/examples/PPOCR/PPOCR-System/python/ppocr_cls.py deleted file mode 100644 index 56a1657..0000000 --- a/examples/PPOCR/PPOCR-System/python/ppocr_cls.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import sys - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' - -import cv2 -import copy -import numpy as np -import math -import time -import traceback - -import utility as utility -from paddleocr.ppocr.postprocess import build_post_process -from paddleocr.ppocr.utils.logging import get_logger -from paddleocr.ppocr.utils.utility import get_image_file_list, check_and_read - -logger = get_logger() - - -class TextClassifier(object): - def __init__(self, args): - self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")] - self.cls_batch_num = args.cls_batch_num - self.cls_thresh = args.cls_thresh - postprocess_params = { - 'name': 'ClsPostProcess', - "label_list": args.label_list, - } - self.postprocess_op = build_post_process(postprocess_params) - self.predictor, self.input_tensor, self.output_tensors, _ = \ - utility.create_predictor(args, 'cls', logger) - self.use_onnx = args.use_onnx - self.use_rknn = args.use_rknn - - def resize_norm_img(self, img): - imgC, imgH, imgW = self.cls_image_shape - h = img.shape[0] - w = img.shape[1] - ratio = w / float(h) - if math.ceil(imgH * ratio) > imgW: - resized_w = imgW - else: - resized_w = int(math.ceil(imgH * ratio)) - resized_image = cv2.resize(img, (resized_w, imgH)) - if self.use_rknn: - padding_im = np.zeros((imgH, imgW, imgC), dtype=np.float32) - padding_im[:, 0:resized_w, :] = resized_image - return padding_im - resized_image = resized_image.astype('float32') - if self.cls_image_shape[0] == 1: - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - else: - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) - padding_im[:, :, 0:resized_w] = resized_image - return padding_im - - def release_rknn(self): - self.predictor.release() - - def __call__(self, img_list): - img_list = copy.deepcopy(img_list) - img_num = len(img_list) - # Calculate the aspect ratio of all text bars - width_list = [] - for img in img_list: - width_list.append(img.shape[1] / float(img.shape[0])) - # Sorting can speed up the cls process - indices = np.argsort(np.array(width_list)) - - cls_res = [['', 0.0]] * img_num - batch_num = self.cls_batch_num - elapse = 0 - for beg_img_no in range(0, img_num, batch_num): - - end_img_no = min(img_num, beg_img_no + batch_num) - norm_img_batch = [] - max_wh_ratio = 0 - starttime = time.time() - for ino in range(beg_img_no, end_img_no): - h, w = img_list[indices[ino]].shape[0:2] - wh_ratio = w * 1.0 / h - max_wh_ratio = max(max_wh_ratio, wh_ratio) - for ino in range(beg_img_no, end_img_no): - norm_img = self.resize_norm_img(img_list[indices[ino]]) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_img_batch = np.concatenate(norm_img_batch) - norm_img_batch = norm_img_batch.copy() - - if self.use_onnx: - input_dict = {} - prob_out = [] - for idx in range(norm_img_batch.shape[0]): - input_dict[self.input_tensor.name] = norm_img_batch[idx:idx+1] - output = self.predictor.run(self.output_tensors, input_dict) - prob_out.append(output[0]) - prob_out = np.concatenate(prob_out) - elif self.use_rknn: - prob_out = [] - for idx in range(norm_img_batch.shape[0]): - img = norm_img_batch[idx] - output = self.predictor.inference(inputs=[img]) - prob_out.append(output[0]) - prob_out = 
np.concatenate(prob_out) - else: - self.input_tensor.copy_from_cpu(norm_img_batch) - self.predictor.run() - prob_out = self.output_tensors[0].copy_to_cpu() - self.predictor.try_shrink_memory() - cls_result = self.postprocess_op(prob_out) - elapse += time.time() - starttime - for rno in range(len(cls_result)): - label, score = cls_result[rno] - cls_res[indices[beg_img_no + rno]] = [label, score] - if '180' in label and score > self.cls_thresh: - img_list[indices[beg_img_no + rno]] = cv2.rotate( - img_list[indices[beg_img_no + rno]], 1) - return img_list, cls_res, elapse - - -def main(args): - image_file_list = get_image_file_list(args.image_dir) - text_classifier = TextClassifier(args) - valid_image_file_list = [] - img_list = [] - for image_file in image_file_list: - img, flag, _ = check_and_read(image_file) - if not flag: - img = cv2.imread(image_file) - if img is None: - logger.info("error in loading image:{}".format(image_file)) - continue - valid_image_file_list.append(image_file) - img_list.append(img) - try: - img_list, cls_res, predict_time = text_classifier(img_list) - except Exception as E: - logger.info(traceback.format_exc()) - logger.info(E) - exit() - for ino in range(len(img_list)): - logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], - cls_res[ino])) - if args.use_rknn: - text_classifier.release_rknn() - - -if __name__ == "__main__": - main(utility.parse_args()) diff --git a/examples/PPOCR/PPOCR-System/python/ppocr_det.py b/examples/PPOCR/PPOCR-System/python/ppocr_det.py index f4d66b7..b6d915b 100644 --- a/examples/PPOCR/PPOCR-System/python/ppocr_det.py +++ b/examples/PPOCR/PPOCR-System/python/ppocr_det.py @@ -13,367 +13,113 @@ # limitations under the License. import os import sys - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' - import cv2 import numpy as np -import time -import sys +import argparse +import utils.operators +from utils.db_postprocess import DBPostProcess, DetPostProcess -import utility -from paddleocr.ppocr.utils.logging import get_logger -from paddleocr.ppocr.utils.utility import get_image_file_list, check_and_read -from paddleocr.ppocr.data import create_operators, transform -from paddleocr.ppocr.postprocess import build_post_process -import json -logger = get_logger() +# add path +realpath = os.path.abspath(__file__) +_sep = os.path.sep +realpath = realpath.split(_sep) +sys.path.append(os.path.join(realpath[0]+_sep, *realpath[1:realpath.index('rknn_model_zoo')+1])) -class TextDetector(object): - def __init__(self, args): - self.args = args - self.det_algorithm = args.det_algorithm - self.use_onnx = args.use_onnx - self.use_rknn = args.use_rknn - pre_process_list = [{ - 'DetResizeForTest': { - 'limit_side_len': args.det_limit_side_len, - 'limit_type': args.det_limit_type, +DET_INPUT_SHAPE = [480, 480] # h,w + +ONNX_PRE_PROCESS_CONFIG = [ + { + 'DetResizeForTest': + { + 'limit_side_len': 480, + 'limit_type': 'max', } - }, { + }, + { 'NormalizeImage': { 'std': [0.229, 0.224, 0.225], 'mean': [0.485, 0.456, 0.406], 'scale': '1./255.', 'order': 'hwc' } - }, { - 'ToCHWImage': None - }, { - 'KeepKeys': { - 'keep_keys': ['image', 'shape'] - } - }] - postprocess_params = {} - if self.det_algorithm == "DB": - postprocess_params['name'] = 'DBPostProcess' - postprocess_params["thresh"] = args.det_db_thresh - postprocess_params["box_thresh"] = args.det_db_box_thresh - 
postprocess_params["max_candidates"] = 1000 - postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio - postprocess_params["use_dilation"] = args.use_dilation - postprocess_params["score_mode"] = args.det_db_score_mode - postprocess_params["box_type"] = args.det_box_type - elif self.det_algorithm == "DB++": - postprocess_params['name'] = 'DBPostProcess' - postprocess_params["thresh"] = args.det_db_thresh - postprocess_params["box_thresh"] = args.det_db_box_thresh - postprocess_params["max_candidates"] = 1000 - postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio - postprocess_params["use_dilation"] = args.use_dilation - postprocess_params["score_mode"] = args.det_db_score_mode - postprocess_params["box_type"] = args.det_box_type - pre_process_list[1] = { - 'NormalizeImage': { - 'std': [1.0, 1.0, 1.0], - 'mean': - [0.48109378172549, 0.45752457890196, 0.40787054090196], - 'scale': '1./255.', - 'order': 'hwc' - } - } - elif self.det_algorithm == "EAST": - postprocess_params['name'] = 'EASTPostProcess' - postprocess_params["score_thresh"] = args.det_east_score_thresh - postprocess_params["cover_thresh"] = args.det_east_cover_thresh - postprocess_params["nms_thresh"] = args.det_east_nms_thresh - elif self.det_algorithm == "SAST": - pre_process_list[0] = { - 'DetResizeForTest': { - 'resize_long': args.det_limit_side_len - } - } - postprocess_params['name'] = 'SASTPostProcess' - postprocess_params["score_thresh"] = args.det_sast_score_thresh - postprocess_params["nms_thresh"] = args.det_sast_nms_thresh - - if args.det_box_type == 'poly': - postprocess_params["sample_pts_num"] = 6 - postprocess_params["expand_scale"] = 1.2 - postprocess_params["shrink_ratio_of_width"] = 0.2 - else: - postprocess_params["sample_pts_num"] = 2 - postprocess_params["expand_scale"] = 1.0 - postprocess_params["shrink_ratio_of_width"] = 0.3 - - elif self.det_algorithm == "PSE": - postprocess_params['name'] = 'PSEPostProcess' - postprocess_params["thresh"] = args.det_pse_thresh - postprocess_params["box_thresh"] = args.det_pse_box_thresh - postprocess_params["min_area"] = args.det_pse_min_area - postprocess_params["box_type"] = args.det_box_type - postprocess_params["scale"] = args.det_pse_scale - elif self.det_algorithm == "FCE": - pre_process_list[0] = { - 'DetResizeForTest': { - 'rescale_img': [1080, 736] - } - } - postprocess_params['name'] = 'FCEPostProcess' - postprocess_params["scales"] = args.scales - postprocess_params["alpha"] = args.alpha - postprocess_params["beta"] = args.beta - postprocess_params["fourier_degree"] = args.fourier_degree - postprocess_params["box_type"] = args.det_box_type - elif self.det_algorithm == "CT": - pre_process_list[0] = {'ScaleAlignedShort': {'short_size': 640}} - postprocess_params['name'] = 'CTPostProcess' - else: - logger.info("unknown det_algorithm:{}".format(self.det_algorithm)) - sys.exit(0) - - self.preprocess_op = create_operators(pre_process_list) - self.postprocess_op = build_post_process(postprocess_params) - self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor( - args, 'det', logger) + }, + ] - if self.use_onnx: - img_h, img_w = self.input_tensor.shape[2:] - if isinstance(img_h, str) or isinstance(img_w, str): - pass - elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0: - pre_process_list[0] = { - 'DetResizeForTest': { - 'image_shape': [img_h, img_w] - } - } - - if self.use_rknn: - pre_process_list[0] = { - 'DetResizeForTest': { - 'image_shape': args.det_image_shape +RKNN_PRE_PROCESS_CONFIG = 
[ + { + 'DetResizeForTest': { + 'image_shape': DET_INPUT_SHAPE } - } - pre_process_list[1] = { - 'NormalizeImage': { + }, + { + 'NormalizeImage': + { 'std': [1., 1., 1.], 'mean': [0., 0., 0.], 'scale': '1.', 'order': 'hwc' - } } - self.preprocess_op = create_operators(pre_process_list) - - if args.benchmark: - import auto_log - pid = os.getpid() - gpu_id = utility.get_infer_gpuid() - self.autolog = auto_log.AutoLogger( - model_name="det", - model_precision=args.precision, - batch_size=1, - data_shape="dynamic", - save_path=None, - inference_config=self.config, - pids=pid, - process_name=None, - gpu_ids=gpu_id if args.use_gpu else None, - time_keys=[ - 'preprocess_time', 'inference_time', 'postprocess_time' - ], - warmup=2, - logger=logger) - - def order_points_clockwise(self, pts): - rect = np.zeros((4, 2), dtype="float32") - s = pts.sum(axis=1) - rect[0] = pts[np.argmin(s)] - rect[2] = pts[np.argmax(s)] - tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0) - diff = np.diff(np.array(tmp), axis=1) - rect[1] = tmp[np.argmin(diff)] - rect[3] = tmp[np.argmax(diff)] - return rect - - def clip_det_res(self, points, img_height, img_width): - for pno in range(points.shape[0]): - points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) - points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) - return points - - def filter_tag_det_res(self, dt_boxes, image_shape): - img_height, img_width = image_shape[0:2] - dt_boxes_new = [] - for box in dt_boxes: - if type(box) is list: - box = np.array(box) - box = self.order_points_clockwise(box) - box = self.clip_det_res(box, img_height, img_width) - rect_width = int(np.linalg.norm(box[0] - box[1])) - rect_height = int(np.linalg.norm(box[0] - box[3])) - if rect_width <= 3 or rect_height <= 3: - continue - dt_boxes_new.append(box) - dt_boxes = np.array(dt_boxes_new) - return dt_boxes - - def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): - img_height, img_width = image_shape[0:2] - dt_boxes_new = [] - for box in dt_boxes: - if type(box) is list: - box = np.array(box) - box = self.clip_det_res(box, img_height, img_width) - dt_boxes_new.append(box) - dt_boxes = np.array(dt_boxes_new) - return dt_boxes - - def release_rknn(self): - self.predictor.release() - - def __call__(self, img): - ori_im = img.copy() - data = {'image': img} - - st = time.time() - - if self.args.benchmark: - self.autolog.times.start() - - data = transform(data, self.preprocess_op) - img, shape_list = data - if img is None: - return None, 0 - img = np.expand_dims(img, axis=0) - shape_list = np.expand_dims(shape_list, axis=0) - img = img.copy() - - if self.args.benchmark: - self.autolog.times.stamp() - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = img - outputs = self.predictor.run(self.output_tensors, input_dict) - elif self.use_rknn: - img = img[0].transpose(1, 2, 0) - outputs = self.predictor.inference(inputs=[img]) - else: - self.input_tensor.copy_from_cpu(img) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.args.benchmark: - self.autolog.times.stamp() - - preds = {} - if self.det_algorithm == "EAST": - preds['f_geo'] = outputs[0] - preds['f_score'] = outputs[1] - elif self.det_algorithm == 'SAST': - preds['f_border'] = outputs[0] - preds['f_score'] = outputs[1] - preds['f_tco'] = outputs[2] - preds['f_tvo'] = outputs[3] - elif self.det_algorithm in ['DB', 'PSE', 'DB++']: - preds['maps'] = outputs[0] - elif 
self.det_algorithm == 'FCE': - for i, output in enumerate(outputs): - preds['level_{}'.format(i)] = output - elif self.det_algorithm == "CT": - preds['maps'] = outputs[0] - preds['score'] = outputs[1] - else: - raise NotImplementedError - - post_result = self.postprocess_op(preds, shape_list) - dt_boxes = post_result[0]['points'] - - if self.args.det_box_type == 'poly': - dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) - else: - dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) - - if self.args.benchmark: - self.autolog.times.end(stamp=True) - et = time.time() - return dt_boxes, et - st - - -if __name__ == "__main__": - args = utility.parse_args() - image_file_list = get_image_file_list(args.image_dir) - text_detector = TextDetector(args) - total_time = 0 - draw_img_save_dir = args.draw_img_save_dir - os.makedirs(draw_img_save_dir, exist_ok=True) - - if args.warmup: - img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) - for i in range(2): - res = text_detector(img) - - save_results = [] - for idx, image_file in enumerate(image_file_list): - img, flag_gif, flag_pdf = check_and_read(image_file) - if not flag_gif and not flag_pdf: - img = cv2.imread(image_file) - if not flag_pdf: - if img is None: - logger.debug("error in loading image:{}".format(image_file)) - continue - imgs = [img] - else: - page_num = args.page_num - if page_num > len(img) or page_num == 0: - page_num = len(img) - imgs = img[:page_num] - for index, img in enumerate(imgs): - st = time.time() - dt_boxes, _ = text_detector(img) - elapse = time.time() - st - total_time += elapse - if len(imgs) > 1: - save_pred = os.path.basename(image_file) + '_' + str( - index) + "\t" + str( - json.dumps([x.tolist() for x in dt_boxes])) + "\n" - else: - save_pred = os.path.basename(image_file) + "\t" + str( - json.dumps([x.tolist() for x in dt_boxes])) + "\n" - save_results.append(save_pred) - logger.info(save_pred) - if len(imgs) > 1: - logger.info("{}_{} The predict time of {}: {}".format( - idx, index, image_file, elapse)) - else: - logger.info("{} The predict time of {}: {}".format( - idx, image_file, elapse)) - - src_im = utility.draw_text_det_res(dt_boxes, img) - - if flag_gif: - save_file = image_file[:-3] + "png" - elif flag_pdf: - save_file = image_file.replace('.pdf', - '_' + str(index) + '.png') - else: - save_file = image_file - img_path = os.path.join( - draw_img_save_dir, - "det_res_{}".format(os.path.basename(save_file))) - cv2.imwrite(img_path, src_im) - logger.info("The visualized image saved in {}".format(img_path)) - - with open(os.path.join(draw_img_save_dir, "det_results.txt"), 'w') as f: - f.writelines(save_results) - f.close() - if args.benchmark: - text_detector.autolog.report() - if args.use_rknn: - text_detector.release_rknn() + } + ] + +POSTPROCESS_CONFIG = { + 'DBPostProcess':{ + 'thresh': 0.3, + 'box_thresh': 0.6, + 'max_candidates': 1000, + 'unclip_ratio': 1.5, + 'use_dilation': False, + 'score_mode': 'fast', + } +} + +class TextDetector: + def __init__(self, args) -> None: + self.model, self.framework = setup_model(args) + self.preprocess_funct = [] + PRE_PROCESS_CONFIG = ONNX_PRE_PROCESS_CONFIG if self.framework == 'onnx' else RKNN_PRE_PROCESS_CONFIG + for item in PRE_PROCESS_CONFIG: + for key in item: + pclass = getattr(utils.operators, key) + p = pclass(**item[key]) + self.preprocess_funct.append(p) + + self.db_postprocess = DBPostProcess(**POSTPROCESS_CONFIG['DBPostProcess']) + self.det_postprocess = DetPostProcess() + + def preprocess(self, img): + for p in 
self.preprocess_funct: + img = p(img) + + if self.framework == 'onnx': + image_input = img['image'] + image_input = image_input.reshape(1, *image_input.shape) + image_input = image_input.transpose(0, 3, 1, 2) + img['image'] = image_input + return img + + def run(self, img): + model_input = self.preprocess({'image':img}) + output = self.model.run([model_input['image']]) + + preds = {'maps' : output[0].astype(np.float32)} + result = self.db_postprocess(preds, model_input['shape']) + + output = self.det_postprocess.filter_tag_det_res(result[0]['points'], img.shape) + return output + +def setup_model(args): + model_path = args.det_model_path + if model_path.endswith('.rknn'): + platform = 'rknn' + from py_utils.rknn_executor import RKNN_model_container + model = RKNN_model_container(model_path, args.target, args.device_id) + elif model_path.endswith('onnx'): + platform = 'onnx' + from py_utils.onnx_executor import ONNX_model_container + model = ONNX_model_container(model_path) + else: + assert False, "{} is not rknn/onnx model".format(model_path) + print('Model-{} is {} model, starting val'.format(model_path, platform)) + return model, platform \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-System/python/ppocr_rec.py b/examples/PPOCR/PPOCR-System/python/ppocr_rec.py index 9bf7103..8795080 100644 --- a/examples/PPOCR/PPOCR-System/python/ppocr_rec.py +++ b/examples/PPOCR/PPOCR-System/python/ppocr_rec.py @@ -13,680 +13,85 @@ # limitations under the License. import os import sys -from PIL import Image -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' - +import argparse import cv2 import numpy as np -import math -import time -import traceback +import utils.operators +from utils.rec_postprocess import CTCLabelDecode -import utility -from paddleocr.ppocr.postprocess import build_post_process -from paddleocr.ppocr.utils.logging import get_logger -from paddleocr.ppocr.utils.utility import get_image_file_list, check_and_read +# add path +realpath = os.path.abspath(__file__) +_sep = os.path.sep +realpath = realpath.split(_sep) +sys.path.append(os.path.join(realpath[0]+_sep, *realpath[1:realpath.index('rknn_model_zoo')+1])) -logger = get_logger() +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' +REC_INPUT_SHAPE = [48, 320] # h,w +CHARACTER_DICT_PATH= '../model/ppocr_keys_v1.txt' -class TextRecognizer(object): - def __init__(self, args): - self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] - self.rec_batch_num = args.rec_batch_num - self.rec_algorithm = args.rec_algorithm - postprocess_params = { - 'name': 'CTCLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - if self.rec_algorithm == "SRN": - postprocess_params = { - 'name': 'SRNLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "RARE": - postprocess_params = { - 'name': 'AttnLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == 'NRTR': - postprocess_params = { - 'name': 'NRTRLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "SAR": - postprocess_params = { - 'name': 'SARLabelDecode', - "character_dict_path": args.rec_char_dict_path, - 
"use_space_char": args.use_space_char - } - elif self.rec_algorithm == "VisionLAN": - postprocess_params = { - 'name': 'VLLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char +PRE_PROCESS_CONFIG = [ + { + 'NormalizeImage': { + 'std': [1, 1, 1], + 'mean': [0, 0, 0], + 'scale': '1./255.', + 'order': 'hwc' } - elif self.rec_algorithm == 'ViTSTR': - postprocess_params = { - 'name': 'ViTSTRLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == 'ABINet': - postprocess_params = { - 'name': 'ABINetLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "SPIN": - postprocess_params = { - 'name': 'SPINLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "RobustScanner": - postprocess_params = { - 'name': 'SARLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char, - "rm_symbol": True - } - elif self.rec_algorithm == 'RFL': - postprocess_params = { - 'name': 'RFLLabelDecode', - "character_dict_path": None, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "SATRN": - postprocess_params = { - 'name': 'SATRNLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char, - "rm_symbol": True - } - elif self.rec_algorithm == "PREN": - postprocess_params = {'name': 'PRENLabelDecode'} - elif self.rec_algorithm == "CAN": - self.inverse = args.rec_image_inverse - postprocess_params = { - 'name': 'CANLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - self.postprocess_op = build_post_process(postprocess_params) - self.predictor, self.input_tensor, self.output_tensors, self.config = \ - utility.create_predictor(args, 'rec', logger) - self.benchmark = args.benchmark - self.use_onnx = args.use_onnx - self.use_rknn = args.use_rknn - if args.benchmark: - import auto_log - pid = os.getpid() - gpu_id = utility.get_infer_gpuid() - self.autolog = auto_log.AutoLogger( - model_name="rec", - model_precision=args.precision, - batch_size=args.rec_batch_num, - data_shape="dynamic", - save_path=None, #args.save_log_path, - inference_config=self.config, - pids=pid, - process_name=None, - gpu_ids=gpu_id if args.use_gpu else None, - time_keys=[ - 'preprocess_time', 'inference_time', 'postprocess_time' - ], - warmup=0, - logger=logger) - - def resize_norm_img(self, img, max_wh_ratio): - imgC, imgH, imgW = self.rec_image_shape - if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR': - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # return padding_im - image_pil = Image.fromarray(np.uint8(img)) - if self.rec_algorithm == 'ViTSTR': - img = image_pil.resize([imgW, imgH], Image.BICUBIC) - else: - img = image_pil.resize([imgW, imgH], Image.LANCZOS) - img = np.array(img) - norm_img = np.expand_dims(img, -1) - norm_img = norm_img.transpose((2, 0, 1)) - if self.rec_algorithm == 'ViTSTR': - norm_img = norm_img.astype(np.float32) / 255. - else: - norm_img = norm_img.astype(np.float32) / 128. - 1. 
- return norm_img - elif self.rec_algorithm == 'RFL': - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_CUBIC) - resized_image = resized_image.astype('float32') - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - resized_image -= 0.5 - resized_image /= 0.5 - return resized_image - - assert imgC == img.shape[2] - imgW = int((imgH * max_wh_ratio)) - if self.use_onnx: - w = self.input_tensor.shape[3:][0] - if isinstance(w, str): - pass - elif w is not None and w > 0: - imgW = w - if self.use_rknn: - imgW = self.rec_image_shape[2] - h, w = img.shape[:2] - ratio = w / float(h) - if math.ceil(imgH * ratio) > imgW: - resized_w = imgW - else: - resized_w = int(math.ceil(imgH * ratio)) - if self.rec_algorithm == 'RARE': - if resized_w > self.rec_image_shape[2]: - resized_w = self.rec_image_shape[2] - imgW = self.rec_image_shape[2] - resized_image = cv2.resize(img, (resized_w, imgH)) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) - padding_im[:, :, 0:resized_w] = resized_image - return padding_im - - def resize_norm_img_vl(self, img, image_shape): - - imgC, imgH, imgW = image_shape - img = img[:, :, ::-1] # bgr2rgb - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - return resized_image - - def resize_norm_img_srn(self, img, image_shape): - imgC, imgH, imgW = image_shape - - img_black = np.zeros((imgH, imgW)) - im_hei = img.shape[0] - im_wid = img.shape[1] - - if im_wid <= im_hei * 1: - img_new = cv2.resize(img, (imgH * 1, imgH)) - elif im_wid <= im_hei * 2: - img_new = cv2.resize(img, (imgH * 2, imgH)) - elif im_wid <= im_hei * 3: - img_new = cv2.resize(img, (imgH * 3, imgH)) - else: - img_new = cv2.resize(img, (imgW, imgH)) - - img_np = np.asarray(img_new) - img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) - img_black[:, 0:img_np.shape[1]] = img_np - img_black = img_black[:, :, np.newaxis] - - row, col, c = img_black.shape - c = 1 - - return np.reshape(img_black, (c, row, col)).astype(np.float32) - - def srn_other_inputs(self, image_shape, num_heads, max_text_length): - - imgC, imgH, imgW = image_shape - feature_dim = int((imgH / 8) * (imgW / 8)) - - encoder_word_pos = np.array(range(0, feature_dim)).reshape( - (feature_dim, 1)).astype('int64') - gsrm_word_pos = np.array(range(0, max_text_length)).reshape( - (max_text_length, 1)).astype('int64') - - gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) - gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( - [-1, 1, max_text_length, max_text_length]) - gsrm_slf_attn_bias1 = np.tile( - gsrm_slf_attn_bias1, - [1, num_heads, 1, 1]).astype('float32') * [-1e9] - - gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( - [-1, 1, max_text_length, max_text_length]) - gsrm_slf_attn_bias2 = np.tile( - gsrm_slf_attn_bias2, - [1, num_heads, 1, 1]).astype('float32') * [-1e9] - - encoder_word_pos = encoder_word_pos[np.newaxis, :] - gsrm_word_pos = gsrm_word_pos[np.newaxis, :] - - return [ - encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, - gsrm_slf_attn_bias2 + } ] - def process_image_srn(self, img, image_shape, num_heads, max_text_length): - norm_img = self.resize_norm_img_srn(img, image_shape) - norm_img = 
norm_img[np.newaxis, :] - - [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ - self.srn_other_inputs(image_shape, num_heads, max_text_length) - - gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32) - gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32) - encoder_word_pos = encoder_word_pos.astype(np.int64) - gsrm_word_pos = gsrm_word_pos.astype(np.int64) - - return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, - gsrm_slf_attn_bias2) - - def resize_norm_img_sar(self, img, image_shape, - width_downsample_ratio=0.25): - imgC, imgH, imgW_min, imgW_max = image_shape - h = img.shape[0] - w = img.shape[1] - valid_ratio = 1.0 - # make sure new_width is an integral multiple of width_divisor. - width_divisor = int(1 / width_downsample_ratio) - # resize - ratio = w / float(h) - resize_w = math.ceil(imgH * ratio) - if resize_w % width_divisor != 0: - resize_w = round(resize_w / width_divisor) * width_divisor - if imgW_min is not None: - resize_w = max(imgW_min, resize_w) - if imgW_max is not None: - valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) - resize_w = min(imgW_max, resize_w) - resized_image = cv2.resize(img, (resize_w, imgH)) - resized_image = resized_image.astype('float32') - # norm - if image_shape[0] == 1: - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - else: - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - resize_shape = resized_image.shape - padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) - padding_im[:, :, 0:resize_w] = resized_image - pad_shape = padding_im.shape - - return padding_im, resize_shape, pad_shape, valid_ratio - - def resize_norm_img_spin(self, img): - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # return padding_im - img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC) - img = np.array(img, np.float32) - img = np.expand_dims(img, -1) - img = img.transpose((2, 0, 1)) - mean = [127.5] - std = [127.5] - mean = np.array(mean, dtype=np.float32) - std = np.array(std, dtype=np.float32) - mean = np.float32(mean.reshape(1, -1)) - stdinv = 1 / np.float32(std.reshape(1, -1)) - img -= mean - img *= stdinv - return img - - def resize_norm_img_svtr(self, img, image_shape): - - imgC, imgH, imgW = image_shape - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - return resized_image - - def resize_norm_img_abinet(self, img, image_shape): - - imgC, imgH, imgW = image_shape - - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image / 255. - - mean = np.array([0.485, 0.456, 0.406]) - std = np.array([0.229, 0.224, 0.225]) - resized_image = ( - resized_image - mean[None, None, ...]) / std[None, None, ...] 
- resized_image = resized_image.transpose((2, 0, 1)) - resized_image = resized_image.astype('float32') - - return resized_image - - def norm_img_can(self, img, image_shape): - - img = cv2.cvtColor( - img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image - - if self.inverse: - img = 255 - img - - if self.rec_image_shape[0] == 1: - h, w = img.shape - _, imgH, imgW = self.rec_image_shape - if h < imgH or w < imgW: - padding_h = max(imgH - h, 0) - padding_w = max(imgW - w, 0) - img_padded = np.pad(img, ((0, padding_h), (0, padding_w)), - 'constant', - constant_values=(255)) - img = img_padded - - img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w - img = img.astype('float32') - +POSTPROCESS_CONFIG = { + 'CTCLabelDecode':{ + "character_dict_path": CHARACTER_DICT_PATH, + "use_space_char": True + } + } +class TextRecognizer: + def __init__(self, args) -> None: + self.model, self.framework = setup_model(args) + self.preprocess_funct = [] + for item in PRE_PROCESS_CONFIG: + for key in item: + pclass = getattr(utils.operators, key) + p = pclass(**item[key]) + self.preprocess_funct.append(p) + + self.ctc_postprocess = CTCLabelDecode(**POSTPROCESS_CONFIG['CTCLabelDecode']) + + def preprocess(self, img): + for p in self.preprocess_funct: + img = p(img) + + if self.framework == 'onnx': + image_input = img['image'] + image_input = image_input.reshape(1, *image_input.shape) + image_input = image_input.transpose(0, 3, 1, 2) + img['image'] = image_input return img - - def release_rknn(self): - self.predictor.release() - - def __call__(self, img_list): - img_num = len(img_list) - # Calculate the aspect ratio of all text bars - width_list = [] - for img in img_list: - width_list.append(img.shape[1] / float(img.shape[0])) - # Sorting can speed up the recognition process - indices = np.argsort(np.array(width_list)) - rec_res = [['', 0.0]] * img_num - batch_num = self.rec_batch_num - st = time.time() - if self.benchmark: - self.autolog.times.start() - for beg_img_no in range(0, img_num, batch_num): - end_img_no = min(img_num, beg_img_no + batch_num) - norm_img_batch = [] - if self.rec_algorithm == "SRN": - encoder_word_pos_list = [] - gsrm_word_pos_list = [] - gsrm_slf_attn_bias1_list = [] - gsrm_slf_attn_bias2_list = [] - if self.rec_algorithm == "SAR": - valid_ratios = [] - imgC, imgH, imgW = self.rec_image_shape[:3] - max_wh_ratio = imgW / imgH - # max_wh_ratio = 0 - for ino in range(beg_img_no, end_img_no): - h, w = img_list[indices[ino]].shape[0:2] - wh_ratio = w * 1.0 / h - max_wh_ratio = max(max_wh_ratio, wh_ratio) - for ino in range(beg_img_no, end_img_no): - if self.rec_algorithm == "SAR": - norm_img, _, _, valid_ratio = self.resize_norm_img_sar( - img_list[indices[ino]], self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - valid_ratio = np.expand_dims(valid_ratio, axis=0) - valid_ratios.append(valid_ratio) - norm_img_batch.append(norm_img) - elif self.rec_algorithm == "SRN": - norm_img = self.process_image_srn( - img_list[indices[ino]], self.rec_image_shape, 8, 25) - encoder_word_pos_list.append(norm_img[1]) - gsrm_word_pos_list.append(norm_img[2]) - gsrm_slf_attn_bias1_list.append(norm_img[3]) - gsrm_slf_attn_bias2_list.append(norm_img[4]) - norm_img_batch.append(norm_img[0]) - elif self.rec_algorithm in ["SVTR", "SATRN"]: - norm_img = self.resize_norm_img_svtr(img_list[indices[ino]], - self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm in ["VisionLAN", "PREN"]: - norm_img = 
self.resize_norm_img_vl(img_list[indices[ino]], - self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm == 'SPIN': - norm_img = self.resize_norm_img_spin(img_list[indices[ino]]) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm == "ABINet": - norm_img = self.resize_norm_img_abinet( - img_list[indices[ino]], self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm == "RobustScanner": - norm_img, _, _, valid_ratio = self.resize_norm_img_sar( - img_list[indices[ino]], - self.rec_image_shape, - width_downsample_ratio=0.25) - norm_img = norm_img[np.newaxis, :] - valid_ratio = np.expand_dims(valid_ratio, axis=0) - valid_ratios = [] - valid_ratios.append(valid_ratio) - norm_img_batch.append(norm_img) - word_positions_list = [] - word_positions = np.array(range(0, 40)).astype('int64') - word_positions = np.expand_dims(word_positions, axis=0) - word_positions_list.append(word_positions) - elif self.rec_algorithm == "CAN": - norm_img = self.norm_img_can(img_list[indices[ino]], - max_wh_ratio) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_image_mask = np.ones(norm_img.shape, dtype='float32') - word_label = np.ones([1, 36], dtype='int64') - norm_img_mask_batch = [] - word_label_list = [] - norm_img_mask_batch.append(norm_image_mask) - word_label_list.append(word_label) - else: - norm_img = self.resize_norm_img(img_list[indices[ino]], - max_wh_ratio) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_img_batch = np.concatenate(norm_img_batch) - norm_img_batch = norm_img_batch.copy() - if self.benchmark: - self.autolog.times.stamp() - - if self.rec_algorithm == "SRN": - encoder_word_pos_list = np.concatenate(encoder_word_pos_list) - gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list) - gsrm_slf_attn_bias1_list = np.concatenate( - gsrm_slf_attn_bias1_list) - gsrm_slf_attn_bias2_list = np.concatenate( - gsrm_slf_attn_bias2_list) - - inputs = [ - norm_img_batch, - encoder_word_pos_list, - gsrm_word_pos_list, - gsrm_slf_attn_bias1_list, - gsrm_slf_attn_bias2_list, - ] - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = {"predict": outputs[2]} - else: - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle( - input_names[i]) - input_tensor.copy_from_cpu(inputs[i]) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - preds = {"predict": outputs[2]} - elif self.rec_algorithm == "SAR": - valid_ratios = np.concatenate(valid_ratios) - inputs = [ - norm_img_batch, - np.array( - [valid_ratios], dtype=np.float32), - ] - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = outputs[0] - else: - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle( - input_names[i]) - input_tensor.copy_from_cpu(inputs[i]) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if 
self.benchmark: - self.autolog.times.stamp() - preds = outputs[0] - elif self.rec_algorithm == "RobustScanner": - valid_ratios = np.concatenate(valid_ratios) - word_positions_list = np.concatenate(word_positions_list) - inputs = [norm_img_batch, valid_ratios, word_positions_list] - - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = outputs[0] - else: - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle( - input_names[i]) - input_tensor.copy_from_cpu(inputs[i]) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - preds = outputs[0] - elif self.rec_algorithm == "CAN": - norm_img_mask_batch = np.concatenate(norm_img_mask_batch) - word_label_list = np.concatenate(word_label_list) - inputs = [norm_img_batch, norm_img_mask_batch, word_label_list] - if self.use_onnx: - input_dict = {} - input_dict[self.input_tensor.name] = norm_img_batch - outputs = self.predictor.run(self.output_tensors, - input_dict) - preds = outputs - else: - input_names = self.predictor.get_input_names() - input_tensor = [] - for i in range(len(input_names)): - input_tensor_i = self.predictor.get_input_handle( - input_names[i]) - input_tensor_i.copy_from_cpu(inputs[i]) - input_tensor.append(input_tensor_i) - self.input_tensor = input_tensor - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - preds = outputs - else: - if self.use_onnx: - input_dict = {} - preds = [] - for idx in range(norm_img_batch.shape[0]): - input_dict[self.input_tensor.name] = norm_img_batch[idx:idx+1] - output = self.predictor.run(self.output_tensors, - input_dict) - preds.append(output[0]) - preds = np.concatenate(preds) - elif self.use_rknn: - preds = [] - for idx in range(norm_img_batch.shape[0]): - img = norm_img_batch[idx:idx+1] - output = self.predictor.inference(inputs=[img], data_format=['nchw']) - preds.append(output[0]) - preds = np.concatenate(preds) - else: - self.input_tensor.copy_from_cpu(norm_img_batch) - self.predictor.run() - outputs = [] - for output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) - if self.benchmark: - self.autolog.times.stamp() - if len(outputs) != 1: - preds = outputs - else: - preds = outputs[0] - rec_result = self.postprocess_op(preds) - for rno in range(len(rec_result)): - rec_res[indices[beg_img_no + rno]] = rec_result[rno] - if self.benchmark: - self.autolog.times.end(stamp=True) - return rec_res, time.time() - st - - -def main(args): - image_file_list = get_image_file_list(args.image_dir) - text_recognizer = TextRecognizer(args) - valid_image_file_list = [] - img_list = [] - - logger.info( - "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " - "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320" - ) - # warmup 2 times - if args.warmup: - img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8) - for i in range(2): - res = text_recognizer([img] * int(args.rec_batch_num)) - - for image_file in image_file_list: - img, flag, _ = check_and_read(image_file) - if not flag: - img = cv2.imread(image_file) - if img 
is None: - logger.info("error in loading image:{}".format(image_file)) - continue - valid_image_file_list.append(image_file) - img_list.append(img) - try: - rec_res, _ = text_recognizer(img_list) - - except Exception as E: - logger.info(traceback.format_exc()) - logger.info(E) - exit() - for ino in range(len(img_list)): - logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], - rec_res[ino])) - if args.benchmark: - text_recognizer.autolog.report() - if args.use_rknn: - text_recognizer.release_rknn() - - -if __name__ == "__main__": - main(utility.parse_args()) + + def run(self, imgs): + outputs=[] + for img in imgs: + img = cv2.resize(img, (REC_INPUT_SHAPE[1], REC_INPUT_SHAPE[0])) + model_input = self.preprocess({'image':img}) + output = self.model.run([model_input['image']]) + preds = output[0].astype(np.float32) + output = self.ctc_postprocess(preds) + outputs.append(output) + return outputs + +def setup_model(args): + model_path = args.rec_model_path + if model_path.endswith('.rknn'): + platform = 'rknn' + from py_utils.rknn_executor import RKNN_model_container + model = RKNN_model_container(model_path, args.target, args.device_id) + elif model_path.endswith('onnx'): + platform = 'onnx' + from py_utils.onnx_executor import ONNX_model_container + model = ONNX_model_container(model_path) + else: + assert False, "{} is not rknn/onnx model".format(model_path) + print('Model-{} is {} model, starting val'.format(model_path, platform)) + return model, platform \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-System/python/ppocr_system.py b/examples/PPOCR/PPOCR-System/python/ppocr_system.py index b047032..f915b8e 100644 --- a/examples/PPOCR/PPOCR-System/python/ppocr_system.py +++ b/examples/PPOCR/PPOCR-System/python/ppocr_system.py @@ -13,110 +13,83 @@ # limitations under the License. 
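The rewritten ppocr_rec.py above reduces recognition to a fixed pipeline: resize to REC_INPUT_SHAPE, run the configured preprocess operators, invoke the model, and decode with CTCLabelDecode, while setup_model() dispatches on the model file extension (.rknn vs .onnx). A minimal driver sketch under assumed names: the import path, model path, and image path are placeholders, and the Namespace fields mirror init_args() in ppocr_system.py below.

    # Sketch only: drives the refactored TextRecognizer directly.
    # Field names mirror the demo's argparse options; paths are placeholders.
    import argparse
    import cv2
    from ppocr_rec import TextRecognizer  # assumed import of the module above

    args = argparse.Namespace(
        rec_model_path='../model/ppocr_rec.rknn',  # placeholder .rknn/.onnx path
        target='rk3566',
        device_id=None,
    )
    recognizer = TextRecognizer(args)

    crop = cv2.imread('../model/test.jpg')  # one cropped text line
    results = recognizer.run([crop])        # run() resizes to REC_INPUT_SHAPE itself
    print(results[0])                       # e.g. [('text', 0.98)]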
import os import sys -import subprocess - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(__dir__) -sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) - -os.environ["FLAGS_allocator_strategy"] = 'auto_growth' - import cv2 import copy import numpy as np -import json -import time -import logging -from PIL import Image -import utility as utility +import argparse import ppocr_rec as predict_rec import ppocr_det as predict_det -import ppocr_cls as predict_cls -from paddleocr.ppocr.utils.utility import get_image_file_list, check_and_read -from paddleocr.ppocr.utils.logging import get_logger -from utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop -logger = get_logger() +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +DET_INPUT_SHAPE = [480, 480] # h,w class TextSystem(object): def __init__(self, args): - if not args.show_log: - logger.setLevel(logging.INFO) - self.text_detector = predict_det.TextDetector(args) self.text_recognizer = predict_rec.TextRecognizer(args) - self.use_angle_cls = args.use_angle_cls - self.drop_score = args.drop_score - if self.use_angle_cls: - self.text_classifier = predict_cls.TextClassifier(args) + self.drop_score = 0.5 - self.args = args - self.crop_image_res_index = 0 - - def draw_crop_rec_res(self, output_dir, img_crop_list, rec_res): - os.makedirs(output_dir, exist_ok=True) - bbox_num = len(img_crop_list) - for bno in range(bbox_num): - cv2.imwrite( - os.path.join(output_dir, - f"mg_crop_{bno+self.crop_image_res_index}.jpg"), - img_crop_list[bno]) - logger.debug(f"{bno}, {rec_res[bno]}") - self.crop_image_res_index += bbox_num - - def release_rknn(self): - self.text_detector.release_rknn() - self.text_recognizer.release_rknn() - if self.use_angle_cls: - self.text_classifier.release_rknn() - logger.info("release all rknn model.") - - def __call__(self, img, cls=True): - time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0} - start = time.time() + def run(self, img): + # 1. TextDetector ori_im = img.copy() - dt_boxes, elapse = self.text_detector(img) - time_dict['det'] = elapse - logger.debug("dt_boxes num : {}, elapse : {}".format( - len(dt_boxes), elapse)) + dt_boxes = self.text_detector.run(img) if dt_boxes is None: return None, None - img_crop_list = [] + img_crop_list = [] dt_boxes = sorted_boxes(dt_boxes) - for bno in range(len(dt_boxes)): tmp_box = copy.deepcopy(dt_boxes[bno]) - if self.args.det_box_type == "quad": - img_crop = get_rotate_crop_image(ori_im, tmp_box) - else: - img_crop, box = get_minarea_rect_crop(ori_im, tmp_box) - dt_boxes[bno] = np.array(box) + img_crop = get_rotate_crop_image(ori_im, tmp_box) img_crop_list.append(img_crop) - if self.use_angle_cls and cls: - img_crop_list, angle_list, elapse = self.text_classifier( - img_crop_list) - time_dict['cls'] = elapse - logger.debug("cls num : {}, elapse : {}".format( - len(img_crop_list), elapse)) - rec_res, elapse = self.text_recognizer(img_crop_list) - time_dict['rec'] = elapse - logger.debug("rec_res num : {}, elapse : {}".format( - len(rec_res), elapse)) - if self.args.save_crop_res: - self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, - rec_res) + # 2. TextRecognizer + rec_res = self.text_recognizer.run(img_crop_list) + + # 3. 
Filter filter_boxes, filter_rec_res = [], [] for box, rec_result in zip(dt_boxes, rec_res): - text, score = rec_result + text, score = rec_result[0] if score >= self.drop_score: filter_boxes.append(box) filter_rec_res.append(rec_result) - end = time.time() - time_dict['all'] = end - start - return filter_boxes, filter_rec_res, time_dict + return filter_boxes, filter_rec_res + +def get_rotate_crop_image(img, points): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + assert len(points) == 4, "shape of points must be 4*2" + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img def sorted_boxes(dt_boxes): """ @@ -142,131 +115,28 @@ def sorted_boxes(dt_boxes): return _boxes -def main(args): - image_file_list = get_image_file_list(args.image_dir) - image_file_list = image_file_list[args.process_id::args.total_process_num] - text_sys = TextSystem(args) - is_visualize = True - font_path = args.vis_font_path - drop_score = args.drop_score - draw_img_save_dir = args.draw_img_save_dir - os.makedirs(draw_img_save_dir, exist_ok=True) - save_results = [] - - logger.info( - "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " - "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320" - ) - - # warm up 10 times - if args.warmup: - img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) - for i in range(10): - res = text_sys(img) - - total_time = 0 - cpu_mem, gpu_mem, gpu_util = 0, 0, 0 - _st = time.time() - count = 0 - for idx, image_file in enumerate(image_file_list): - - img, flag_gif, flag_pdf = check_and_read(image_file) - if not flag_gif and not flag_pdf: - img = cv2.imread(image_file) - if not flag_pdf: - if img is None: - logger.debug("error in loading image:{}".format(image_file)) - continue - imgs = [img] - else: - page_num = args.page_num - if page_num > len(img) or page_num == 0: - page_num = len(img) - imgs = img[:page_num] - for index, img in enumerate(imgs): - starttime = time.time() - dt_boxes, rec_res, time_dict = text_sys(img) - elapse = time.time() - starttime - total_time += elapse - if len(imgs) > 1: - logger.debug( - str(idx) + '_' + str(index) + " Predict time of %s: %.3fs" - % (image_file, elapse)) - else: - logger.debug( - str(idx) + " Predict time of %s: %.3fs" % (image_file, - elapse)) - for text, score in rec_res: - logger.debug("{}, {:.3f}".format(text, score)) - - res = [{ - "transcription": rec_res[i][0], - "points": np.array(dt_boxes[i]).astype(np.int32).tolist(), - } for i in range(len(dt_boxes))] - if len(imgs) > 1: - save_pred = os.path.basename(image_file) + '_' + str( - index) + "\t" + json.dumps( - res, 
ensure_ascii=False) + "\n" - else: - save_pred = os.path.basename(image_file) + "\t" + json.dumps( - res, ensure_ascii=False) + "\n" - save_results.append(save_pred) - - if is_visualize: - image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) - boxes = dt_boxes - txts = [rec_res[i][0] for i in range(len(rec_res))] - scores = [rec_res[i][1] for i in range(len(rec_res))] - - draw_img = draw_ocr_box_txt( - image, - boxes, - txts, - scores, - drop_score=drop_score, - font_path=font_path) - if flag_gif: - save_file = image_file[:-3] + "png" - elif flag_pdf: - save_file = image_file.replace('.pdf', - '_' + str(index) + '.png') - else: - save_file = image_file - cv2.imwrite( - os.path.join(draw_img_save_dir, - os.path.basename(save_file)), - draw_img[:, :, ::-1]) - logger.debug("The visualized image saved in {}".format( - os.path.join(draw_img_save_dir, os.path.basename( - save_file)))) - if args.use_rknn: - text_sys.release_rknn() - - logger.info("The predict total time is {}".format(time.time() - _st)) - if args.benchmark: - text_sys.text_detector.autolog.report() - text_sys.text_recognizer.autolog.report() - - with open( - os.path.join(draw_img_save_dir, "system_results.txt"), - 'w', - encoding='utf-8') as f: - f.writelines(save_results) - - -if __name__ == "__main__": - args = utility.parse_args() - if args.use_mp: - p_list = [] - total_process_num = args.total_process_num - for process_id in range(total_process_num): - cmd = [sys.executable, "-u"] + sys.argv + [ - "--process_id={}".format(process_id), - "--use_mp={}".format(False) - ] - p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout) - p_list.append(p) - for p in p_list: - p.wait() - else: - main(args) +def init_args(): + parser = argparse.ArgumentParser(description='PPOCR-System Python Demo') + # basic params + parser.add_argument('--det_model_path', type=str, required= True, help='model path, could be .onnx or .rknn file') + parser.add_argument('--rec_model_path', type=str, required= True, help='model path, could be .onnx or .rknn file') + parser.add_argument('--target', type=str, default='rk3566', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, default=None, help='device id') + # parser.add_argument('--vis_font_path', type=str, default='../model/simfang.ttf', help='vis font path') + return parser + +if __name__ == '__main__': + # Init model + parser = init_args() + args = parser.parse_args() + system_model = TextSystem(args) + + # Set inputs + img_path = '../model/test.jpg' + img = cv2.imread(img_path) + img = cv2.resize(img, (DET_INPUT_SHAPE[1], DET_INPUT_SHAPE[0])) + + # Inference + filter_boxes, filter_rec_res = system_model.run(img) + + print(filter_rec_res) \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-System/python/requirements.txt b/examples/PPOCR/PPOCR-System/python/requirements.txt deleted file mode 100644 index 4c587ce..0000000 --- a/examples/PPOCR/PPOCR-System/python/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -paddleocr==2.6 \ No newline at end of file diff --git a/examples/PPOCR/PPOCR-System/python/utility.py b/examples/PPOCR/PPOCR-System/python/utility.py deleted file mode 100644 index 0a71871..0000000 --- a/examples/PPOCR/PPOCR-System/python/utility.py +++ /dev/null @@ -1,777 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
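get_rotate_crop_image() above carries the geometry of the whole system: each detected quadrilateral is warped to an upright rectangle whose width and height come from the longer pair of opposing edges, and crops at least 1.5x taller than wide are rotated flat before recognition. A standalone sketch of that warp with invented corner coordinates:

    # Standalone check of the quad-to-rectangle warp (made-up coordinates).
    import cv2
    import numpy as np

    img = np.zeros((480, 480, 3), dtype=np.uint8)  # stand-in detector input
    # Corners in (x, y) order: top-left, top-right, bottom-right, bottom-left.
    points = np.float32([[100, 200], [300, 210], [298, 250], [98, 240]])

    w = int(max(np.linalg.norm(points[0] - points[1]),
                np.linalg.norm(points[2] - points[3])))  # ~200
    h = int(max(np.linalg.norm(points[0] - points[3]),
                np.linalg.norm(points[1] - points[2])))  # ~40
    dst = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
    M = cv2.getPerspectiveTransform(points, dst)
    crop = cv2.warpPerspective(img, M, (w, h),
                               borderMode=cv2.BORDER_REPLICATE,
                               flags=cv2.INTER_CUBIC)
    assert crop.shape[:2] == (h, w)
    if crop.shape[0] / crop.shape[1] >= 1.5:  # vertical text: lay it flat
        crop = np.rot90(crop)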
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import sys -import platform -import cv2 -import numpy as np -import paddle -from PIL import Image, ImageDraw, ImageFont -import math -from paddle import inference -import time -import random -import logging - - -def str2bool(v): - return v.lower() in ("true", "t", "1") - - -def init_args(): - parser = argparse.ArgumentParser() - # params for prediction engine - parser.add_argument("--use_gpu", type=str2bool, default=True) - parser.add_argument("--use_xpu", type=str2bool, default=False) - parser.add_argument("--use_npu", type=str2bool, default=False) - parser.add_argument("--ir_optim", type=str2bool, default=True) - parser.add_argument("--use_tensorrt", type=str2bool, default=False) - parser.add_argument("--min_subgraph_size", type=int, default=15) - parser.add_argument("--precision", type=str, default="fp32") - parser.add_argument("--gpu_mem", type=int, default=500) - parser.add_argument("--gpu_id", type=int, default=0) - - # params for text detector - parser.add_argument("--image_dir", type=str) - parser.add_argument("--page_num", type=int, default=0) - parser.add_argument("--det_algorithm", type=str, default='DB') - parser.add_argument("--det_model_dir", type=str) - parser.add_argument("--det_limit_side_len", type=float, default=960) - parser.add_argument("--det_image_shape", type=int, nargs='+', default=[960, 960], help="[h, w]") - parser.add_argument("--det_limit_type", type=str, default='max') - parser.add_argument("--det_box_type", type=str, default='quad') - - # DB parmas - parser.add_argument("--det_db_thresh", type=float, default=0.3) - parser.add_argument("--det_db_box_thresh", type=float, default=0.6) - parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) - parser.add_argument("--max_batch_size", type=int, default=10) - parser.add_argument("--use_dilation", type=str2bool, default=False) - parser.add_argument("--det_db_score_mode", type=str, default="fast") - - # EAST parmas - parser.add_argument("--det_east_score_thresh", type=float, default=0.8) - parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) - parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) - - # SAST parmas - parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) - parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) - - # PSE parmas - parser.add_argument("--det_pse_thresh", type=float, default=0) - parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) - parser.add_argument("--det_pse_min_area", type=float, default=16) - parser.add_argument("--det_pse_scale", type=int, default=1) - - # FCE parmas - parser.add_argument("--scales", type=list, default=[8, 16, 32]) - parser.add_argument("--alpha", type=float, default=1.0) - parser.add_argument("--beta", type=float, default=1.0) - parser.add_argument("--fourier_degree", type=int, default=5) - - # params for text recognizer - parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet') - parser.add_argument("--rec_model_dir", type=str) - parser.add_argument("--rec_image_inverse", type=str2bool, default=True) - 
parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") - parser.add_argument("--rec_batch_num", type=int, default=6) - parser.add_argument("--max_text_length", type=int, default=25) - parser.add_argument( - "--rec_char_dict_path", - type=str, - default="./ppocr/utils/ppocr_keys_v1.txt") - parser.add_argument("--use_space_char", type=str2bool, default=True) - parser.add_argument( - "--vis_font_path", type=str, default="./doc/fonts/simfang.ttf") - parser.add_argument("--drop_score", type=float, default=0.5) - - # params for e2e - parser.add_argument("--e2e_algorithm", type=str, default='PGNet') - parser.add_argument("--e2e_model_dir", type=str) - parser.add_argument("--e2e_limit_side_len", type=float, default=768) - parser.add_argument("--e2e_limit_type", type=str, default='max') - - # PGNet parmas - parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) - parser.add_argument( - "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt") - parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') - parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') - - # params for text classifier - parser.add_argument("--use_angle_cls", type=str2bool, default=False) - parser.add_argument("--cls_model_dir", type=str) - parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") - parser.add_argument("--label_list", type=list, default=['0', '180']) - parser.add_argument("--cls_batch_num", type=int, default=6) - parser.add_argument("--cls_thresh", type=float, default=0.9) - - parser.add_argument("--enable_mkldnn", type=str2bool, default=False) - parser.add_argument("--cpu_threads", type=int, default=10) - parser.add_argument("--use_pdserving", type=str2bool, default=False) - parser.add_argument("--warmup", type=str2bool, default=False) - - # SR parmas - parser.add_argument("--sr_model_dir", type=str) - parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") - parser.add_argument("--sr_batch_num", type=int, default=1) - - # - parser.add_argument( - "--draw_img_save_dir", type=str, default="./inference_results") - parser.add_argument("--save_crop_res", type=str2bool, default=False) - parser.add_argument("--crop_res_save_dir", type=str, default="./output") - - # multi-process - parser.add_argument("--use_mp", type=str2bool, default=False) - parser.add_argument("--total_process_num", type=int, default=1) - parser.add_argument("--process_id", type=int, default=0) - - parser.add_argument("--benchmark", type=str2bool, default=False) - parser.add_argument("--save_log_path", type=str, default="./log_output/") - - parser.add_argument("--show_log", type=str2bool, default=True) - parser.add_argument("--use_onnx", type=str2bool, default=False) - parser.add_argument("--use_rknn", type=str2bool, default=False) - parser.add_argument("--platform", type=str, default="rk3568") - return parser - - -def parse_args(): - parser = init_args() - return parser.parse_args() - - -def create_predictor(args, mode, logger): - if mode == "det": - model_dir = args.det_model_dir - elif mode == 'cls': - model_dir = args.cls_model_dir - elif mode == 'rec': - model_dir = args.rec_model_dir - elif mode == 'table': - model_dir = args.table_model_dir - elif mode == 'ser': - model_dir = args.ser_model_dir - elif mode == 're': - model_dir = args.re_model_dir - elif mode == "sr": - model_dir = args.sr_model_dir - elif mode == 'layout': - model_dir = args.layout_model_dir - else: - model_dir = args.e2e_model_dir - - if model_dir is None: - 
logger.info("not find {} model file path {}".format(mode, model_dir)) - sys.exit(0) - if args.use_onnx: - import onnxruntime as ort - model_file_path = model_dir - if not os.path.exists(model_file_path): - raise ValueError("not find model file path {}".format( - model_file_path)) - sess = ort.InferenceSession(model_file_path) - return sess, sess.get_inputs()[0], None, None - elif args.use_rknn: - from rknn.api import RKNN - rknn = RKNN() - print('--> Load rknn model') - model_file_path = model_dir - if not os.path.exists(model_file_path): - raise ValueError("not find model file path {}".format( - model_file_path)) - ret = rknn.load_rknn(model_file_path) - if ret != 0: - print('Load rknn model failed!') - exit(ret) - print('done') - print('--> Init runtime environment') - # ret = rknn.init_runtime() - ret = rknn.init_runtime(args.platform) - if ret != 0: - print('Init runtime environment failed!') - exit(ret) - print('done') - return rknn, None, None, None - else: - file_names = ['model', 'inference'] - for file_name in file_names: - model_file_path = '{}/{}.pdmodel'.format(model_dir, file_name) - params_file_path = '{}/{}.pdiparams'.format(model_dir, file_name) - if os.path.exists(model_file_path) and os.path.exists( - params_file_path): - break - if not os.path.exists(model_file_path): - raise ValueError( - "not find model.pdmodel or inference.pdmodel in {}".format( - model_dir)) - if not os.path.exists(params_file_path): - raise ValueError( - "not find model.pdiparams or inference.pdiparams in {}".format( - model_dir)) - - config = inference.Config(model_file_path, params_file_path) - - if hasattr(args, 'precision'): - if args.precision == "fp16" and args.use_tensorrt: - precision = inference.PrecisionType.Half - elif args.precision == "int8": - precision = inference.PrecisionType.Int8 - else: - precision = inference.PrecisionType.Float32 - else: - precision = inference.PrecisionType.Float32 - - if args.use_gpu: - gpu_id = get_infer_gpuid() - if gpu_id is None: - logger.warning( - "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." - ) - config.enable_use_gpu(args.gpu_mem, args.gpu_id) - if args.use_tensorrt: - config.enable_tensorrt_engine( - workspace_size=1 << 30, - precision_mode=precision, - max_batch_size=args.max_batch_size, - min_subgraph_size=args. 
- min_subgraph_size, # skip the minmum trt subgraph - use_calib_mode=False) - - # collect shape - trt_shape_f = os.path.join(model_dir, - f"{mode}_trt_dynamic_shape.txt") - - if not os.path.exists(trt_shape_f): - config.collect_shape_range_info(trt_shape_f) - logger.info( - f"collect dynamic shape info into : {trt_shape_f}") - try: - config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, - True) - except Exception as E: - logger.info(E) - logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!") - - elif args.use_npu: - config.enable_custom_device("npu") - elif args.use_xpu: - config.enable_xpu(10 * 1024 * 1024) - else: - config.disable_gpu() - if args.enable_mkldnn: - # cache 10 different shapes for mkldnn to avoid memory leak - config.set_mkldnn_cache_capacity(10) - config.enable_mkldnn() - if args.precision == "fp16": - config.enable_mkldnn_bfloat16() - if hasattr(args, "cpu_threads"): - config.set_cpu_math_library_num_threads(args.cpu_threads) - else: - # default cpu threads as 10 - config.set_cpu_math_library_num_threads(10) - # enable memory optim - config.enable_memory_optim() - config.disable_glog_info() - config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") - config.delete_pass("matmul_transpose_reshape_fuse_pass") - if mode == 're': - config.delete_pass("simplify_with_basic_ops_pass") - if mode == 'table': - config.delete_pass("fc_fuse_pass") # not supported for table - config.switch_use_feed_fetch_ops(False) - config.switch_ir_optim(True) - - # create predictor - predictor = inference.create_predictor(config) - input_names = predictor.get_input_names() - if mode in ['ser', 're']: - input_tensor = [] - for name in input_names: - input_tensor.append(predictor.get_input_handle(name)) - else: - for name in input_names: - input_tensor = predictor.get_input_handle(name) - output_tensors = get_output_tensors(args, mode, predictor) - return predictor, input_tensor, output_tensors, config - - -def get_output_tensors(args, mode, predictor): - output_names = predictor.get_output_names() - output_tensors = [] - if mode == "rec" and args.rec_algorithm in [ - "CRNN", "SVTR_LCNet", "SVTR_HGNet" - ]: - output_name = 'softmax_0.tmp_0' - if output_name in output_names: - return [predictor.get_output_handle(output_name)] - else: - for output_name in output_names: - output_tensor = predictor.get_output_handle(output_name) - output_tensors.append(output_tensor) - else: - for output_name in output_names: - output_tensor = predictor.get_output_handle(output_name) - output_tensors.append(output_tensor) - return output_tensors - - -def get_infer_gpuid(): - sysstr = platform.system() - if sysstr == "Windows": - return 0 - - if not paddle.device.is_compiled_with_rocm: - cmd = "env | grep CUDA_VISIBLE_DEVICES" - else: - cmd = "env | grep HIP_VISIBLE_DEVICES" - env_cuda = os.popen(cmd).readlines() - if len(env_cuda) == 0: - return 0 - else: - gpu_id = env_cuda[0].strip().split("=")[1] - return int(gpu_id[0]) - - -def draw_e2e_res(dt_boxes, strs, img_path): - src_im = cv2.imread(img_path) - for box, str in zip(dt_boxes, strs): - box = box.astype(np.int32).reshape((-1, 1, 2)) - cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) - cv2.putText( - src_im, - str, - org=(int(box[0, 0, 0]), int(box[0, 0, 1])), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=0.7, - color=(0, 255, 0), - thickness=1) - return src_im - - -def draw_text_det_res(dt_boxes, img): - for box in dt_boxes: - box = np.array(box).astype(np.int32).reshape(-1, 2) - cv2.polylines(img, [box], True, color=(255, 255, 0), 
thickness=2) - return img - - -def resize_img(img, input_size=600): - """ - resize img and limit the longest side of the image to input_size - """ - img = np.array(img) - im_shape = img.shape - im_size_max = np.max(im_shape[0:2]) - im_scale = float(input_size) / float(im_size_max) - img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) - return img - - -def draw_ocr(image, - boxes, - txts=None, - scores=None, - drop_score=0.5, - font_path="./doc/fonts/simfang.ttf"): - """ - Visualize the results of OCR detection and recognition - args: - image(Image|array): RGB image - boxes(list): boxes with shape(N, 4, 2) - txts(list): the texts - scores(list): txxs corresponding scores - drop_score(float): only scores greater than drop_threshold will be visualized - font_path: the path of font which is used to draw text - return(array): - the visualized img - """ - if scores is None: - scores = [1] * len(boxes) - box_num = len(boxes) - for i in range(box_num): - if scores is not None and (scores[i] < drop_score or - math.isnan(scores[i])): - continue - box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64) - image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) - if txts is not None: - img = np.array(resize_img(image, input_size=600)) - txt_img = text_visual( - txts, - scores, - img_h=img.shape[0], - img_w=600, - threshold=drop_score, - font_path=font_path) - img = np.concatenate([np.array(img), np.array(txt_img)], axis=1) - return img - return image - - -def draw_ocr_box_txt(image, - boxes, - txts=None, - scores=None, - drop_score=0.5, - font_path="./doc/fonts/simfang.ttf"): - h, w = image.height, image.width - img_left = image.copy() - img_right = np.ones((h, w, 3), dtype=np.uint8) * 255 - random.seed(0) - - draw_left = ImageDraw.Draw(img_left) - if txts is None or len(txts) != len(boxes): - txts = [None] * len(boxes) - for idx, (box, txt) in enumerate(zip(boxes, txts)): - if scores is not None and scores[idx] < drop_score: - continue - color = (random.randint(0, 255), random.randint(0, 255), - random.randint(0, 255)) - draw_left.polygon(box, fill=color) - img_right_text = draw_box_txt_fine((w, h), box, txt, font_path) - pts = np.array(box, np.int32).reshape((-1, 1, 2)) - cv2.polylines(img_right_text, [pts], True, color, 1) - img_right = cv2.bitwise_and(img_right, img_right_text) - img_left = Image.blend(image, img_left, 0.5) - img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) - img_show.paste(img_left, (0, 0, w, h)) - img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h)) - return np.array(img_show) - - -def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): - box_height = int( - math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][1])**2)) - box_width = int( - math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][1])**2)) - - if box_height > 2 * box_width and box_height > 30: - img_text = Image.new('RGB', (box_height, box_width), (255, 255, 255)) - draw_text = ImageDraw.Draw(img_text) - if txt: - font = create_font(txt, (box_height, box_width), font_path) - draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) - img_text = img_text.transpose(Image.ROTATE_270) - else: - img_text = Image.new('RGB', (box_width, box_height), (255, 255, 255)) - draw_text = ImageDraw.Draw(img_text) - if txt: - font = create_font(txt, (box_width, box_height), font_path) - draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) - - pts1 = np.float32( - [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]) - pts2 = np.array(box, 
dtype=np.float32) - M = cv2.getPerspectiveTransform(pts1, pts2) - - img_text = np.array(img_text, dtype=np.uint8) - img_right_text = cv2.warpPerspective( - img_text, - M, - img_size, - flags=cv2.INTER_NEAREST, - borderMode=cv2.BORDER_CONSTANT, - borderValue=(255, 255, 255)) - return img_right_text - - -def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"): - font_size = int(sz[1] * 0.99) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - length = font.getlength(txt) - if length > sz[0]: - font_size = int(font_size * sz[0] / length) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - return font - - -def str_count(s): - """ - Count the number of Chinese characters, - a single English character and a single number - equal to half the length of Chinese characters. - args: - s(string): the input of string - return(int): - the number of Chinese characters - """ - import string - count_zh = count_pu = 0 - s_len = len(s) - en_dg_count = 0 - for c in s: - if c in string.ascii_letters or c.isdigit() or c.isspace(): - en_dg_count += 1 - elif c.isalpha(): - count_zh += 1 - else: - count_pu += 1 - return s_len - math.ceil(en_dg_count / 2) - - -def text_visual(texts, - scores, - img_h=400, - img_w=600, - threshold=0., - font_path="./doc/simfang.ttf"): - """ - create new blank img and draw txt on it - args: - texts(list): the text will be draw - scores(list|None): corresponding score of each txt - img_h(int): the height of blank img - img_w(int): the width of blank img - font_path: the path of font which is used to draw text - return(array): - """ - if scores is not None: - assert len(texts) == len( - scores), "The number of txts and corresponding scores must match" - - def create_blank_img(): - blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 - blank_img[:, img_w - 1:] = 0 - blank_img = Image.fromarray(blank_img).convert("RGB") - draw_txt = ImageDraw.Draw(blank_img) - return blank_img, draw_txt - - blank_img, draw_txt = create_blank_img() - - font_size = 20 - txt_color = (0, 0, 0) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - - gap = font_size + 5 - txt_img_list = [] - count, index = 1, 0 - for idx, txt in enumerate(texts): - index += 1 - if scores[idx] < threshold or math.isnan(scores[idx]): - index -= 1 - continue - first_line = True - while str_count(txt) >= img_w // font_size - 4: - tmp = txt - txt = tmp[:img_w // font_size - 4] - if first_line: - new_txt = str(index) + ': ' + txt - first_line = False - else: - new_txt = ' ' + txt - draw_txt.text((0, gap * count), new_txt, txt_color, font=font) - txt = tmp[img_w // font_size - 4:] - if count >= img_h // gap - 1: - txt_img_list.append(np.array(blank_img)) - blank_img, draw_txt = create_blank_img() - count = 0 - count += 1 - if first_line: - new_txt = str(index) + ': ' + txt + ' ' + '%.3f' % (scores[idx]) - else: - new_txt = " " + txt + " " + '%.3f' % (scores[idx]) - draw_txt.text((0, gap * count), new_txt, txt_color, font=font) - # whether add new blank img or not - if count >= img_h // gap - 1 and idx + 1 < len(texts): - txt_img_list.append(np.array(blank_img)) - blank_img, draw_txt = create_blank_img() - count = 0 - count += 1 - txt_img_list.append(np.array(blank_img)) - if len(txt_img_list) == 1: - blank_img = np.array(txt_img_list[0]) - else: - blank_img = np.concatenate(txt_img_list, axis=1) - return np.array(blank_img) - - -def base64_to_cv2(b64str): - import base64 - data = base64.b64decode(b64str.encode('utf8')) - data = np.frombuffer(data, np.uint8) - data = 
cv2.imdecode(data, cv2.IMREAD_COLOR) - return data - - -def draw_boxes(image, boxes, scores=None, drop_score=0.5): - if scores is None: - scores = [1] * len(boxes) - for (box, score) in zip(boxes, scores): - if score < drop_score: - continue - box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) - image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) - return image - - -def get_rotate_crop_image(img, points): - ''' - img_height, img_width = img.shape[0:2] - left = int(np.min(points[:, 0])) - right = int(np.max(points[:, 0])) - top = int(np.min(points[:, 1])) - bottom = int(np.max(points[:, 1])) - img_crop = img[top:bottom, left:right, :].copy() - points[:, 0] = points[:, 0] - left - points[:, 1] = points[:, 1] - top - ''' - assert len(points) == 4, "shape of points must be 4*2" - img_crop_width = int( - max( - np.linalg.norm(points[0] - points[1]), - np.linalg.norm(points[2] - points[3]))) - img_crop_height = int( - max( - np.linalg.norm(points[0] - points[3]), - np.linalg.norm(points[1] - points[2]))) - pts_std = np.float32([[0, 0], [img_crop_width, 0], - [img_crop_width, img_crop_height], - [0, img_crop_height]]) - M = cv2.getPerspectiveTransform(points, pts_std) - dst_img = cv2.warpPerspective( - img, - M, (img_crop_width, img_crop_height), - borderMode=cv2.BORDER_REPLICATE, - flags=cv2.INTER_CUBIC) - dst_img_height, dst_img_width = dst_img.shape[0:2] - if dst_img_height * 1.0 / dst_img_width >= 1.5: - dst_img = np.rot90(dst_img) - return dst_img - - -def get_minarea_rect_crop(img, points): - bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32)) - points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) - - index_a, index_b, index_c, index_d = 0, 1, 2, 3 - if points[1][1] > points[0][1]: - index_a = 0 - index_d = 1 - else: - index_a = 1 - index_d = 0 - if points[3][1] > points[2][1]: - index_b = 2 - index_c = 3 - else: - index_b = 3 - index_c = 2 - - box = [points[index_a], points[index_b], points[index_c], points[index_d]] - crop_img = get_rotate_crop_image(img, np.array(box)) - return crop_img, box - - -def check_gpu(use_gpu): - if use_gpu and not paddle.is_compiled_with_cuda(): - use_gpu = False - return use_gpu - - -def _check_image_file(path): - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} - return any([path.lower().endswith(e) for e in img_end]) - - -def get_image_file_list(img_file): - imgs_lists = [] - if img_file is None or not os.path.exists(img_file): - raise Exception("not found any img file in {}".format(img_file)) - - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} - if os.path.isfile(img_file) and _check_image_file(img_file): - imgs_lists.append(img_file) - elif os.path.isdir(img_file): - for single_file in os.listdir(img_file): - file_path = os.path.join(img_file, single_file) - if os.path.isfile(file_path) and _check_image_file(file_path): - imgs_lists.append(file_path) - if len(imgs_lists) == 0: - raise Exception("not found any img file in {}".format(img_file)) - imgs_lists = sorted(imgs_lists) - return imgs_lists - - -def check_and_read(img_path): - if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: - gif = cv2.VideoCapture(img_path) - ret, frame = gif.read() - if not ret: - logger = logging.getLogger('ppocr') - logger.info("Cannot read {}. 
This gif image maybe corrupted.") - return None, False - if len(frame.shape) == 2 or frame.shape[-1] == 1: - frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) - imgvalue = frame[:, :, ::-1] - return imgvalue, True, False - elif os.path.basename(img_path)[-3:] in ['pdf']: - import fitz - from PIL import Image - imgs = [] - with fitz.open(img_path) as pdf: - for pg in range(0, pdf.pageCount): - page = pdf[pg] - mat = fitz.Matrix(2, 2) - pm = page.getPixmap(matrix=mat, alpha=False) - - # if width or height > 2000 pixels, don't enlarge the image - if pm.width > 2000 or pm.height > 2000: - pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) - - img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) - img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) - imgs.append(img) - return imgs, False, True - return None, False, False - - -def create_operators(op_param_list, global_config=None): - """ - create operators based on the config - - Args: - params(list): a dict list, used to create some operators - """ - assert isinstance(op_param_list, list), ('operator config should be a list') - ops = [] - for operator in op_param_list: - assert isinstance(operator, - dict) and len(operator) == 1, "yaml format error" - op_name = list(operator)[0] - param = {} if operator[op_name] is None else operator[op_name] - if global_config is not None: - param.update(global_config) - op = eval(op_name)(**param) - ops.append(op) - return ops - - -def transform(data, ops=None): - """ transform """ - if ops is None: - ops = [] - for op in ops: - data = op(data) - if data is None: - return None - return data - - -if __name__ == '__main__': - pass diff --git a/examples/PPOCR/PPOCR-System/python/utils/__init__.py b/examples/PPOCR/PPOCR-System/python/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/PPOCR/PPOCR-System/python/utils/db_postprocess.py b/examples/PPOCR/PPOCR-System/python/utils/db_postprocess.py new file mode 100644 index 0000000..ac50634 --- /dev/null +++ b/examples/PPOCR/PPOCR-System/python/utils/db_postprocess.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is referenced from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +# import paddle +from shapely.geometry import Polygon +import pyclipper + + +class DBPostProcess(object): + """ + The post process for Differentiable Binarization (DB).
+ """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + score_mode="fast", + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.score_mode = score_mode + assert score_mode in [ + "slow", "fast" + ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) + + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score = self.box_score_fast(pred, points.reshape(-1, 2)) + else: + score = self.box_score_slow(pred, contour) + if self.box_thresh > score: + continue + + box = self.unclip(points).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype(np.int16)) + scores.append(score) + return np.array(boxes, dtype=np.int16), scores + + def unclip(self, box): + unclip_ratio = self.unclip_ratio + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def box_score_slow(self, bitmap, contour): + ''' + box_score_slow: use polyon mean score as the mean score + ''' + h, w = bitmap.shape[:2] + 
contour = contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + # if isinstance(pred, paddle.Tensor): + # pred = pred.numpy() + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate( + np.array(segmentation[batch_index]).astype(np.uint8), + self.dilation_kernel) + else: + mask = segmentation[batch_index] + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, + src_w, src_h) + + boxes_batch.append({'points': boxes}) + return boxes_batch + + +class DistillationDBPostProcess(object): + def __init__(self, + model_name=["student"], + key=None, + thresh=0.3, + box_thresh=0.6, + max_candidates=1000, + unclip_ratio=1.5, + use_dilation=False, + score_mode="fast", + **kwargs): + self.model_name = model_name + self.key = key + self.post_process = DBPostProcess( + thresh=thresh, + box_thresh=box_thresh, + max_candidates=max_candidates, + unclip_ratio=unclip_ratio, + use_dilation=use_dilation, + score_mode=score_mode) + + def __call__(self, predicts, shape_list): + results = {} + for k in self.model_name: + results[k] = self.post_process(predicts[k], shape_list=shape_list) + return results + + +class DetPostProcess(object): + def __init__(self) -> None: + pass + + def order_points_clockwise(self, pts): + """ + reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py + # sort the points based on their x-coordinates + """ + xSorted = pts[np.argsort(pts[:, 0]), :] + + # grab the left-most and right-most points from the sorted + # x-coordinate points + leftMost = xSorted[:2, :] + rightMost = xSorted[2:, :] + + # now, sort the left-most coordinates according to their + # y-coordinates so we can grab the top-left and bottom-left + # points, respectively + leftMost = leftMost[np.argsort(leftMost[:, 1]), :] + (tl, bl) = leftMost + + rightMost = rightMost[np.argsort(rightMost[:, 1]), :] + (tr, br) = rightMost + + rect = np.array([tl, tr, br, bl], dtype="float32") + return rect + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def filter_tag_det_res(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.order_points_clockwise(box) + box = self.clip_det_res(box, img_height, img_width) + rect_width = int(np.linalg.norm(box[0] - box[1])) + rect_height = int(np.linalg.norm(box[0] - box[3])) + if rect_width <= 3 or rect_height <= 3: + continue + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes diff --git a/examples/PPOCR/PPOCR-System/python/utils/operators.py b/examples/PPOCR/PPOCR-System/python/utils/operators.py new file mode 100644
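db_postprocess.py recovers boxes from the DB probability map in five steps: threshold into a bitmap, find contours, fit minimum-area rectangles, score candidates against the raw map, and unclip the survivors. The unclip step grows each box by distance = area * unclip_ratio / perimeter to undo the shrinking that DB applies during training. A toy check of that arithmetic with an invented 10x10 box:

    # Toy check of DBPostProcess.unclip (made-up box, unclip_ratio=2.0).
    import numpy as np
    import pyclipper
    from shapely.geometry import Polygon

    box = [(0, 0), (10, 0), (10, 10), (0, 10)]  # 10x10 square
    poly = Polygon(box)
    distance = poly.area * 2.0 / poly.length    # 100 * 2.0 / 40 = 5.0

    offset = pyclipper.PyclipperOffset()
    offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    expanded = np.array(offset.Execute(distance)[0])
    print(expanded.min(axis=0), expanded.max(axis=0))  # ~[-5 -5], ~[15 15]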
index 0000000..f19c15f --- /dev/null +++ b/examples/PPOCR/PPOCR-System/python/utils/operators.py @@ -0,0 +1,373 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class NRTRDecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + + img = cv2.imdecode(img, 1) + + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) + if self.channel_first: + img = img.transpose((2, 0, 1)) + data['image'] = img + return data + +class NormalizeImage(object): + """ normalize image such as subtract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale -
self.mean) / self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + return data + + +class KeepKeys(object): + def __init__(self, keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + data_list.append(data[key]) + return data_list + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + super(DetResizeForTest, self).__init__() + self.square_input = True + self.resize_type = 0 + if 'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + + + + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + if len(data['shape'].shape) == 1: + data['shape'] = np.expand_dims(data['shape'], axis=0) + return data + + def resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = self.limit_side_len + h, w, c = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'min': + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. 
+ elif self.limit_type == 'resize_long': + ratio = float(limit_side_len) / max(h,w) + else: + raise Exception('not support limit type, image ') + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = max(int(round(resize_h / 32) * 32), 32) + resize_w = max(int(round(resize_w / 32) * 32), 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] + + +class E2EResizeForTest(object): + def __init__(self, **kwargs): + super(E2EResizeForTest, self).__init__() + self.max_side_len = kwargs['max_side_len'] + self.valid_set = kwargs['valid_set'] + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if self.valid_set == 'totaltext': + im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext( + img, max_side_len=self.max_side_len) + else: + im_resized, (ratio_h, ratio_w) = self.resize_image( + img, max_side_len=self.max_side_len) + data['image'] = im_resized + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_for_totaltext(self, im, max_side_len=512): + + h, w, _ = im.shape + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + def resize_image(self, im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + + +class Pad_to_max_len(object): + def __init__(self, **kwargs): + super(Pad_to_max_len, self).__init__() + self.max_h = kwargs['max_h'] + self.max_w = kwargs['max_w'] + + def __call__(self, data): + img = data['image'] + if img.shape[-1] == 3: + # hwc + 
+            if img.shape[0] != self.max_h:
+                pad_h = self.max_h - img.shape[0]
+                pad_w = self.max_w - img.shape[1]
+                img = np.pad(img, ((0, pad_h), (0, pad_w), (0, 0)), 'constant', constant_values=0)
+            if img.shape[1] < self.max_w:
+                pad_w = self.max_w - img.shape[1]
+                img = np.pad(img, ((0, 0), (0, pad_w), (0, 0)), 'constant', constant_values=0)
+
+        elif img.shape[0] == 3:
+            # chw: convert to hwc first, then pad the same way as the branch above
+            # (the original draft padded the channel axis here, which was a bug)
+            img = img.transpose((1, 2, 0))
+            if img.shape[0] != self.max_h:
+                pad_h = self.max_h - img.shape[0]
+                pad_w = self.max_w - img.shape[1]
+                img = np.pad(img, ((0, pad_h), (0, pad_w), (0, 0)), 'constant', constant_values=0)
+            if img.shape[1] < self.max_w:
+                pad_w = self.max_w - img.shape[1]
+                img = np.pad(img, ((0, 0), (0, pad_w), (0, 0)), 'constant', constant_values=0)
+
+        else:
+            assert False, "unsupported image layout: {}".format(img.shape)
+
+        data['image'] = img
+
+        return data
\ No newline at end of file
diff --git a/examples/PPOCR/PPOCR-System/python/utils/rec_postprocess.py b/examples/PPOCR/PPOCR-System/python/utils/rec_postprocess.py
new file mode 100644
index 0000000..3aa3585
--- /dev/null
+++ b/examples/PPOCR/PPOCR-System/python/utils/rec_postprocess.py
@@ -0,0 +1,814 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+# import paddle
+# from paddle.nn import functional as F
+import re
+
+
+class BaseRecLabelDecode(object):
+    """ Convert between text-label and text-index """
+
+    def __init__(self, character_dict_path=None, use_space_char=False):
+        self.beg_str = "sos"
+        self.end_str = "eos"
+
+        self.character_str = []
+        if character_dict_path is None:
+            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
+            dict_character = list(self.character_str)
+        else:
+            with open(character_dict_path, "rb") as fin:
+                lines = fin.readlines()
+                for line in lines:
+                    line = line.decode('utf-8').strip("\n").strip("\r\n")
+                    self.character_str.append(line)
+            if use_space_char:
+                self.character_str.append(" ")
+            dict_character = list(self.character_str)
+
+        dict_character = self.add_special_char(dict_character)
+        self.dict = {}
+        for i, char in enumerate(dict_character):
+            self.dict[char] = i
+        self.character = dict_character
+
+        # guard against character_dict_path being None before the substring test
+        if character_dict_path is not None and 'arabic' in character_dict_path:
+            self.reverse = True
+        else:
+            self.reverse = False
+
+    def pred_reverse(self, pred):
+        pred_re = []
+        c_current = ''
+        for c in pred:
+            if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)):
+                if c_current != '':
+                    pred_re.append(c_current)
+                pred_re.append(c)
+                c_current = ''
+            else:
+                c_current += c
+        if c_current != '':
+            pred_re.append(c_current)
+
+        return ''.join(pred_re[::-1])
+
+    def add_special_char(self, dict_character):
+        return dict_character
+
+    def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
+        """ convert text-index into text-label.
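+        Illustrative sketch: with the CTC blank at index 0 and
+        is_remove_duplicate=True, an index row [1, 1, 0, 2, 2, 0, 3] keeps
+        only the first id of each repeated run, drops the blank ids, and
+        maps the surviving ids [1, 2, 3] through self.character.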
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + selection = np.ones(len(text_index[batch_idx]), dtype=bool) + if is_remove_duplicate: + selection[1:] = text_index[batch_idx][1:] != text_index[ + batch_idx][:-1] + for ignored_token in ignored_tokens: + selection &= text_index[batch_idx] != ignored_token + + char_list = [ + self.character[text_id] + for text_id in text_index[batch_idx][selection] + ] + if text_prob is not None: + conf_list = text_prob[batch_idx][selection] + else: + conf_list = [1] * len(selection) + if len(conf_list) == 0: + conf_list = [0] + + text = ''.join(char_list) + + if self.reverse: # for arabic rec + text = self.pred_reverse(text) + + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def get_ignored_tokens(self): + return [0] # for ctc blank + + +class CTCLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, tuple) or isinstance(preds, list): + preds = preds[-1] + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class DistillationCTCLabelDecode(CTCLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationCTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['ctc'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class AttnLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(AttnLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SEEDLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SEEDLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.padding_str = "padding" + self.end_str = "eos" + self.unknown = "unknown" + dict_character = dict_character + [ + self.end_str, self.padding_str, self.unknown + ] + return dict_character + + def get_ignored_tokens(self): + end_idx = self.get_beg_end_flag_idx("eos") + return [end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "sos": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "eos": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" % beg_or_end + return idx + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + [end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + preds_idx = preds["rec_pred"] + # if isinstance(preds_idx, paddle.Tensor): + # preds_idx = preds_idx.numpy() + if "rec_pred_scores" in preds: + preds_idx = preds["rec_pred"] + preds_prob = preds["rec_pred_scores"] + else: + preds_idx = preds["rec_pred"].argmax(axis=2) + preds_prob = preds["rec_pred"].max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + +class SRNLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SRNLabelDecode, self).__init__(character_dict_path, + use_space_char) + self.max_text_length = kwargs.get('max_text_length', 25) + + def __call__(self, preds, label=None, *args, **kwargs): + pred = preds['predict'] + char_num = len(self.character_str) + 2 + # if isinstance(pred, paddle.Tensor): + # pred = pred.numpy() + pred = np.reshape(pred, [-1, char_num]) + + preds_idx = np.argmax(pred, axis=1) + preds_prob = np.max(pred, axis=1) + + preds_idx = np.reshape(preds_idx, [-1, self.max_text_length]) + + preds_prob = np.reshape(preds_prob, [-1, self.max_text_length]) + + text = self.decode(preds_idx, preds_prob) + + if label is None: + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + return text + label = self.decode(label) + return text, label + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SARLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SARLabelDecode, self).__init__(character_dict_path, + use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', False) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class DistillationSARLabelDecode(SARLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationSARLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['sar'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class PRENLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(PRENLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + padding_str = '' # 0 + end_str = '' # 1 + unknown_str = '' # 2 + + dict_character = [padding_str, end_str, unknown_str] + dict_character + self.padding_idx = 0 + self.end_idx = 1 + self.unknown_idx = 2 + + return dict_character + + def decode(self, text_index, text_prob=None): + """ convert text-index into text-label. 
""" + result_list = [] + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] == self.end_idx: + break + if text_index[batch_idx][idx] in \ + [self.padding_idx, self.unknown_idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + if len(text) > 0: + result_list.append((text, np.mean(conf_list).tolist())) + else: + # here confidence of empty recog result is 1 + result_list.append(('', 1)) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob) + if label is None: + return text + label = self.decode(label) + return text, label + + +class NRTRLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): + super(NRTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + # if isinstance(preds_id, paddle.Tensor): + # preds_id = preds_id.numpy() + # if isinstance(preds_prob, paddle.Tensor): + # preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + else: + # if isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + try: + char_idx = self.character[int(text_index[batch_idx][idx])] + except: + continue + if char_idx == '': # end + break + char_list.append(char_idx) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text.lower(), np.mean(conf_list).tolist())) + return result_list + + +class ViTSTRLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ViTSTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + # if isinstance(preds, paddle.Tensor): + # preds = preds[:, 1:].numpy() + # else: + # preds = preds[:, 1:] + preds = preds[:, 1:].numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['', ''] + dict_character + return dict_character + + +class ABINetLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ABINetLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, dict): + preds = preds['align'][-1].numpy() + # elif isinstance(preds, paddle.Tensor): + # preds = preds.numpy() + # else: + # preds = preds + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = [''] + dict_character + return dict_character + + +class SPINLabelDecode(AttnLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SPINLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + [self.end_str] + dict_character + return dict_character + + +# class VLLabelDecode(BaseRecLabelDecode): +# """ Convert between text-label and text-index """ + +# def __init__(self, character_dict_path=None, use_space_char=False, +# **kwargs): +# super(VLLabelDecode, self).__init__(character_dict_path, use_space_char) +# self.max_text_length = kwargs.get('max_text_length', 25) +# self.nclass = len(self.character) + 1 +# self.character = self.character[10:] + self.character[ +# 1:10] + [self.character[0]] + +# def decode(self, text_index, text_prob=None, is_remove_duplicate=False): +# """ convert text-index into text-label. 
""" +# result_list = [] +# ignored_tokens = self.get_ignored_tokens() +# batch_size = len(text_index) +# for batch_idx in range(batch_size): +# selection = np.ones(len(text_index[batch_idx]), dtype=bool) +# if is_remove_duplicate: +# selection[1:] = text_index[batch_idx][1:] != text_index[ +# batch_idx][:-1] +# for ignored_token in ignored_tokens: +# selection &= text_index[batch_idx] != ignored_token + +# char_list = [ +# self.character[text_id - 1] +# for text_id in text_index[batch_idx][selection] +# ] +# if text_prob is not None: +# conf_list = text_prob[batch_idx][selection] +# else: +# conf_list = [1] * len(selection) +# if len(conf_list) == 0: +# conf_list = [0] + +# text = ''.join(char_list) +# result_list.append((text, np.mean(conf_list).tolist())) +# return result_list + +# def __call__(self, preds, label=None, length=None, *args, **kwargs): +# if len(preds) == 2: # eval mode +# text_pre, x = preds +# b = text_pre.shape[1] +# lenText = self.max_text_length +# nsteps = self.max_text_length + +# if not isinstance(text_pre, paddle.Tensor): +# text_pre = paddle.to_tensor(text_pre, dtype='float32') + +# out_res = paddle.zeros( +# shape=[lenText, b, self.nclass], dtype=x.dtype) +# out_length = paddle.zeros(shape=[b], dtype=x.dtype) +# now_step = 0 +# for _ in range(nsteps): +# if 0 in out_length and now_step < nsteps: +# tmp_result = text_pre[now_step, :, :] +# out_res[now_step] = tmp_result +# tmp_result = tmp_result.topk(1)[1].squeeze(axis=1) +# for j in range(b): +# if out_length[j] == 0 and tmp_result[j] == 0: +# out_length[j] = now_step + 1 +# now_step += 1 +# for j in range(0, b): +# if int(out_length[j]) == 0: +# out_length[j] = nsteps +# start = 0 +# output = paddle.zeros( +# shape=[int(out_length.sum()), self.nclass], dtype=x.dtype) +# for i in range(0, b): +# cur_length = int(out_length[i]) +# output[start:start + cur_length] = out_res[0:cur_length, i, :] +# start += cur_length +# net_out = output +# length = out_length + +# else: # train mode +# net_out = preds[0] +# length = length +# net_out = paddle.concat([t[:l] for t, l in zip(net_out, length)]) +# text = [] +# if not isinstance(net_out, paddle.Tensor): +# net_out = paddle.to_tensor(net_out, dtype='float32') +# net_out = F.softmax(net_out, axis=1) +# for i in range(0, length.shape[0]): +# preds_idx = net_out[int(length[:i].sum()):int(length[:i].sum( +# ) + length[i])].topk(1)[1][:, 0].tolist() +# preds_text = ''.join([ +# self.character[idx - 1] +# if idx > 0 and idx <= len(self.character) else '' +# for idx in preds_idx +# ]) +# preds_prob = net_out[int(length[:i].sum()):int(length[:i].sum( +# ) + length[i])].topk(1)[0][:, 0] +# preds_prob = paddle.exp( +# paddle.log(preds_prob).sum() / (preds_prob.shape[0] + 1e-6)) +# text.append((preds_text, preds_prob.numpy()[0])) +# if label is None: +# return text +# label = self.decode(label) +# return text, label + diff --git a/examples/RetinaFace/README.md b/examples/RetinaFace/README.md index 01f9e4b..469a236 100644 --- a/examples/RetinaFace/README.md +++ b/examples/RetinaFace/README.md @@ -1,4 +1,8 @@ # RetinaFace + +## Current Support Platform +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 + ## Model Source The model used in this example comes from the following open source projects: https://github.com/biubug6/Pytorch_Retinaface @@ -26,23 +30,36 @@ python3 convert_to_onnx.py -m ./weights/mobilenet0.25_Final.pth --network mobile ## Convert ONNX model to RKNN model ``` -cd ./python -python RetinaFace.py [dtype(optional)] [output_rknn_path(optional)] -# such as: python 
RetinaFace.py ../model/RetinaFace_mobile320.onnx rk3566
+cd python
+python convert.py <onnx_model> <TARGET_PLATFORM> <dtype(optional)> <output_rknn_path(optional)>
+# such as: python convert.py ../model/RetinaFace_mobile320.onnx rk3566
+# output model will be saved as ../model/RetinaFace.rknn
 ```
 
 *Description:*
 - <onnx_model> should be the ONNX model path.
-- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588 according to board SOC version.
-- <dtype> is *optional*, could be specified as `i8` or `fp`, `i8` means to do quantization, `fp` means no to do quantization, default is `i8`.
+- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588, RK1808, RV1109, RV1126 according to board SOC version.
+- <dtype> is *optional*, could be specified as `i8` or `u8`; `i8`/`u8` means to do quantization (`u8` applies to RK1808/RV1109/RV1126), default is `i8`/`u8`.
 - <output_rknn_path> is *optional*, used to specify the saving path of the RKNN model, default save path is `../model/RetinaFace.rknn`
 
-After execution, the corresponding rknn model will be generated and rknn will be used to perform connected reasoning on the model. The inference result will be saved as the image result.jpg.
+
+## Python Demo
+
+*Usage:*
+
+```shell
+cd python
+# Inference with RKNN model
+python RetinaFace.py --model_path <rknn_model> --target <TARGET_PLATFORM>
+# The inference result will be saved as the image result.jpg.
+```
+*Description:*
+- <TARGET_PLATFORM>: Specified as the NPU platform name. Such as 'rk3588'.
+- <rknn_model>: Specified as the model path.
 
 ## Android Demo
+**Note: RK1808, RV1109, RV1126 do not support Android.**
 
 ### Compiling && Building
diff --git a/examples/RetinaFace/cpp/CMakeLists.txt b/examples/RetinaFace/cpp/CMakeLists.txt
index dd1562d..788dd6a 100644
--- a/examples/RetinaFace/cpp/CMakeLists.txt
+++ b/examples/RetinaFace/cpp/CMakeLists.txt
@@ -14,11 +14,17 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out)
 
 set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib")
 
+if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126")
+    set(retinaface_file rknpu1/retinaface.cc)
+else()
+    set(retinaface_file rknpu2/retinaface.cc)
+endif()
+
 file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 
 add_executable(${PROJECT_NAME}
     main.cc
-    rknpu2/retinaface.cc
+    ${retinaface_file}
 )
 
 target_link_libraries(${PROJECT_NAME}
@@ -26,6 +32,7 @@ target_link_libraries(${PROJECT_NAME}
     imageutils
     imagedrawing
     ${LIBRKNNRT}
+    dl
 )
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Android")
diff --git a/examples/RetinaFace/cpp/rknpu1/retinaface.cc b/examples/RetinaFace/cpp/rknpu1/retinaface.cc
new file mode 100644
index 0000000..f14e380
--- /dev/null
+++ b/examples/RetinaFace/cpp/rknpu1/retinaface.cc
@@ -0,0 +1,379 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "retinaface.h"
+#include "common.h"
+#include "file_utils.h"
+#include "image_utils.h"
+#include "rknn_box_priors.h"
+
+#define NMS_THRESHOLD 0.4
+#define CONF_THRESHOLD 0.5
+#define VIS_THRESHOLD 0.4
+
+static int clamp(int x, int min, int max) {
+    if (x > max) return max;
+    if (x < min) return min;
+    return x;
+}
+
+static void dump_tensor_attr(rknn_tensor_attr *attr) {
+    printf("  index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, "
+           "zp=%d, scale=%f\n",
+           attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0],
+           attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type),
+           get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale);
+}
+
+static float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1, float ymax1) {
+    float w = fmax(0.f, fmin(xmax0, xmax1) - fmax(xmin0, xmin1) + 1);
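+    // the +1 terms treat box coordinates as inclusive pixel indices
+    // (w = xmax - xmin + 1), matching the area computation below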
+ float h = fmax(0.f, fmin(ymax0, ymax1) - fmax(ymin0, ymin1) + 1); + float i = w * h; + float u = (xmax0 - xmin0 + 1) * (ymax0 - ymin0 + 1) + (xmax1 - xmin1 + 1) * (ymax1 - ymin1 + 1) - i; + return u <= 0.f ? 0.f : (i / u); +} + +static int nms(int validCount, float *outputLocations, int order[], float threshold, int width, int height) { + for (int i = 0; i < validCount; ++i) { + if (order[i] == -1) { + continue; + } + int n = order[i]; + for (int j = i + 1; j < validCount; ++j) { + int m = order[j]; + if (m == -1) { + continue; + } + float xmin0 = outputLocations[n * 4 + 0] * width; + float ymin0 = outputLocations[n * 4 + 1] * height; + float xmax0 = outputLocations[n * 4 + 2] * width; + float ymax0 = outputLocations[n * 4 + 3] * height; + + float xmin1 = outputLocations[m * 4 + 0] * width; + float ymin1 = outputLocations[m * 4 + 1] * height; + float xmax1 = outputLocations[m * 4 + 2] * width; + float ymax1 = outputLocations[m * 4 + 3] * height; + + float iou = CalculateOverlap(xmin0, ymin0, xmax0, ymax0, xmin1, ymin1, xmax1, ymax1); + + if (iou > threshold) { + order[j] = -1; + } + } + } + return 0; +} + +static int quick_sort_indice_inverse(float *input, int left, int right, int *indices) { + float key; + int key_index; + int low = left; + int high = right; + if (left < right) { + key_index = indices[left]; + key = input[left]; + while (low < high) { + while (low < high && input[high] <= key) { + high--; + } + input[low] = input[high]; + indices[low] = indices[high]; + while (low < high && input[low] >= key) { + low++; + } + input[high] = input[low]; + indices[high] = indices[low]; + } + input[low] = key; + indices[low] = key_index; + quick_sort_indice_inverse(input, left, low - 1, indices); + quick_sort_indice_inverse(input, low + 1, right, indices); + } + return low; +} + +static int filterValidResult(float *scores, float *loc, float *landms, const float boxPriors[][4], int model_in_h, int model_in_w, + int filter_indice[], float *props, float threshold, const int num_results) { + int validCount = 0; + const float VARIANCES[2] = {0.1, 0.2}; + // Scale them back to the input size. 
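+    // SSD-style decoding: each prior is (cx, cy, w, h); center offsets are
+    // damped by VARIANCES[0] = 0.1 and size offsets by VARIANCES[1] = 0.2
+    // before expf() recovers the box width and height.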
+ for (int i = 0; i < num_results; ++i) { + float face_score = scores[i * 2 + 1]; + if (face_score > threshold) { + filter_indice[validCount] = i; + props[validCount] = face_score; + //decode location to origin position + float xcenter = loc[i * 4 + 0] * VARIANCES[0] * boxPriors[i][2] + boxPriors[i][0]; + float ycenter = loc[i * 4 + 1] * VARIANCES[0] * boxPriors[i][3] + boxPriors[i][1]; + float w = (float) expf(loc[i * 4 + 2] * VARIANCES[1] ) * boxPriors[i][2]; + float h = (float) expf(loc[i * 4 + 3] * VARIANCES[1]) * boxPriors[i][3]; + + float xmin = xcenter - w * 0.5f; + float ymin = ycenter - h * 0.5f; + float xmax = xmin + w; + float ymax = ymin + h; + + loc[i * 4 + 0] = xmin ; + loc[i * 4 + 1] = ymin ; + loc[i * 4 + 2] = xmax ; + loc[i * 4 + 3] = ymax ; + for (int j = 0; j < 5; ++j) { + landms[i * 10 + 2 * j] = landms[i * 10 + 2 * j] * VARIANCES[0] * boxPriors[i][2] + boxPriors[i][0]; + landms[i * 10 + 2 * j + 1] = landms[i * 10 + 2 * j + 1] * VARIANCES[0] * boxPriors[i][3] + boxPriors[i][1]; + } + ++validCount; + } + } + + return validCount; +} + +static int post_process_retinaface(rknn_app_context_t *app_ctx, image_buffer_t *src_img, rknn_output outputs[], retinaface_result *result, letterbox_t *letter_box) { + float *location = (float *)outputs[0].buf; + float *scores = (float *)outputs[1].buf; + float *landms = (float *)outputs[2].buf; + const float (*prior_ptr)[4]; + int num_priors = 0; + if (app_ctx->model_height == 320) { + num_priors = 4200;//anchors box number + prior_ptr = BOX_PRIORS_320; + } else if(app_ctx->model_height == 640){ + num_priors = 16800;//anchors box number + prior_ptr = BOX_PRIORS_640; + } + else + { + printf("model_shape error!!!\n"); + return -1; + + } + + int filter_indices[num_priors]; + float props[num_priors]; + + memset(filter_indices, 0, sizeof(int)*num_priors); + memset(props, 0, sizeof(float)*num_priors); + + int validCount = filterValidResult(scores, location, landms, prior_ptr, app_ctx->model_height, app_ctx->model_width, + filter_indices, props, CONF_THRESHOLD, num_priors); + + quick_sort_indice_inverse(props, 0, validCount - 1, filter_indices); + nms(validCount, location, filter_indices, NMS_THRESHOLD, src_img->width, src_img->height); + + + int last_count = 0; + result->count = 0; + for (int i = 0; i < validCount; ++i) { + if (last_count >= 128) { + printf("Warning: detected more than 128 faces, can not handle that"); + break; + } + if (filter_indices[i] == -1 || props[i] < VIS_THRESHOLD) { + continue; + } + + int n = filter_indices[i]; + + float x1 = location[n * 4 + 0] * app_ctx->model_width - letter_box->x_pad; + float y1 = location[n * 4 + 1] * app_ctx->model_height - letter_box->y_pad; + float x2 = location[n * 4 + 2] * app_ctx->model_width - letter_box->x_pad; + float y2 = location[n * 4 + 3] * app_ctx->model_height - letter_box->y_pad; + int model_in_w = app_ctx->model_width; + int model_in_h = app_ctx->model_height; + result->object[last_count].box.left = (int)(clamp(x1, 0, model_in_w) / letter_box->scale); // Face box + result->object[last_count].box.top = (int)(clamp(y1, 0, model_in_h) / letter_box->scale); + result->object[last_count].box.right = (int)(clamp(x2, 0, model_in_w) / letter_box->scale); + result->object[last_count].box.bottom = (int)(clamp(y2, 0, model_in_h) / letter_box->scale); + result->object[last_count].score = props[i]; // Confidence + + for (int j = 0; j < 5; ++j) { // Facial feature points + float ponit_x = landms[n * 10 + 2 * j] * app_ctx->model_width - letter_box->x_pad; + float ponit_y = landms[n * 10 + 2 * j + 
1] * app_ctx->model_height - letter_box->y_pad; + result->object[last_count].ponit[j].x = (int)(clamp(ponit_x, 0, model_in_w) / letter_box->scale); + result->object[last_count].ponit[j].y = (int)(clamp(ponit_y, 0, model_in_h) / letter_box->scale); + } + last_count++; + } + + result->count = last_count; + + return 0; +} + +int init_retinaface_model(const char *model_path, rknn_app_context_t *app_ctx) { + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_retinaface_model(rknn_app_context_t *app_ctx) { + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_retinaface_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, retinaface_result *out_result) { + int ret; + image_buffer_t img; + letterbox_t letter_box; + rknn_input inputs[1]; + rknn_output outputs[app_ctx->io_num.n_output]; + memset(&img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(rknn_output) * 3); + memset(&letter_box, 0, sizeof(letterbox_t)); + int bg_color = 114;//letterbox background pixel + + // Pre Process + img.width = app_ctx->model_width; + img.height = app_ctx->model_height; + img.format = IMAGE_FORMAT_RGB888; + img.size = get_image_size(&img); + img.virt_addr = (unsigned char *)malloc(img.size); + + if (img.virt_addr == NULL) { + printf("malloc buffer size:%d fail!\n", img.size); + return -1; + } + + ret = convert_image_with_letterbox(src_img, &img, &letter_box, bg_color); + if (ret < 0) { + printf("convert_image fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + for (int i = 0; i < app_ctx->io_num.n_output; i++) { + outputs[i].index = i; + outputs[i].want_float = 1; + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, 3, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + ret = post_process_retinaface(app_ctx, src_img, outputs, out_result, &letter_box); + if (ret < 0) { + printf("post_process_retinaface fail! 
ret=%d\n", ret); + return -1; + } + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 3, outputs); + +out: + if (img.virt_addr != NULL) { + free(img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/RetinaFace/cpp/rknpu2/retinaface.cc b/examples/RetinaFace/cpp/rknpu2/retinaface.cc index 98d6577..ace445c 100644 --- a/examples/RetinaFace/cpp/rknpu2/retinaface.cc +++ b/examples/RetinaFace/cpp/rknpu2/retinaface.cc @@ -13,7 +13,7 @@ #define CONF_THRESHOLD 0.5 #define VIS_THRESHOLD 0.4 -int clamp(int x, int min, int max) { +static int clamp(int x, int min, int max) { if (x > max) return max; if (x < min) return min; return x; @@ -130,7 +130,7 @@ static int filterValidResult(float *scores, float *loc, float *landms, const flo return validCount; } -int post_process_retinaface(rknn_app_context_t *app_ctx, image_buffer_t *src_img, rknn_output outputs[], retinaface_result *result, letterbox_t *letter_box) { +static int post_process_retinaface(rknn_app_context_t *app_ctx, image_buffer_t *src_img, rknn_output outputs[], retinaface_result *result, letterbox_t *letter_box) { float *location = (float *)outputs[0].buf; float *scores = (float *)outputs[1].buf; float *landms = (float *)outputs[2].buf; @@ -182,13 +182,13 @@ int post_process_retinaface(rknn_app_context_t *app_ctx, image_buffer_t *src_img float y2 = location[n * 4 + 3] * app_ctx->model_height - letter_box->y_pad; int model_in_w = app_ctx->model_width; int model_in_h = app_ctx->model_height; - result->object[last_count].box.left = (int)(clamp(x1, 0, model_in_w) / letter_box->scale);//人脸框 + result->object[last_count].box.left = (int)(clamp(x1, 0, model_in_w) / letter_box->scale); // Face box result->object[last_count].box.top = (int)(clamp(y1, 0, model_in_h) / letter_box->scale); result->object[last_count].box.right = (int)(clamp(x2, 0, model_in_w) / letter_box->scale); result->object[last_count].box.bottom = (int)(clamp(y2, 0, model_in_h) / letter_box->scale); - result->object[last_count].score = props[i];//置信度 + result->object[last_count].score = props[i]; // Confidence - for (int j = 0; j < 5; ++j) { //5点人脸特征点 + for (int j = 0; j < 5; ++j) { // Facial feature points float ponit_x = landms[n * 10 + 2 * j] * app_ctx->model_width - letter_box->x_pad; float ponit_y = landms[n * 10 + 2 * j + 1] * app_ctx->model_height - letter_box->y_pad; result->object[last_count].ponit[j].x = (int)(clamp(ponit_x, 0, model_in_w) / letter_box->scale); @@ -285,10 +285,6 @@ int init_retinaface_model(const char *model_path, rknn_app_context_t *app_ctx) { } int release_retinaface_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -297,6 +293,10 @@ int release_retinaface_model(rknn_app_context_t *app_ctx) { free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/RetinaFace/python/RetinaFace.py b/examples/RetinaFace/python/RetinaFace.py index 2e266d6..1855c9a 100644 --- a/examples/RetinaFace/python/RetinaFace.py +++ b/examples/RetinaFace/python/RetinaFace.py @@ -4,14 +4,12 @@ import urllib.request import time import numpy as np +import argparse import cv2 from math import ceil from itertools import product as product from rknn.api import RKNN -DATASET_PATH = '../model/dataset.txt' -DEFAULT_RKNN_PATH = 
'../model/RetinaFace.rknn' -DEFAULT_QUANT = True def letterbox_resize(image, size, bg_color): """ @@ -27,15 +25,15 @@ def letterbox_resize(image, size, bg_color): target_width, target_height = size image_height, image_width, _ = image.shape - # 计算调整后的图像尺寸 + # Calculate the adjusted image size aspect_ratio = min(target_width / image_width, target_height / image_height) new_width = int(image_width * aspect_ratio) new_height = int(image_height * aspect_ratio) - # 使用 cv2.resize() 进行等比缩放 + # Use cv2.resize() for proportional scaling image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA) - # 创建新的画布并进行填充 + # Create a new canvas and fill it result_image = np.ones((target_height, target_width, 3), dtype=np.uint8) * bg_color offset_x = (target_width - new_width) // 2 offset_y = (target_height - new_height) // 2 @@ -61,7 +59,6 @@ def PriorBox(image_size): #image_size Support (320,320) and (640,640) print("image_size:",image_size," num_priors=",output.shape[0]) return output - def box_decode(loc, priors): """Decode locations from predictions using priors to undo the encoding we did for offset regression at train time. @@ -82,7 +79,6 @@ def box_decode(loc, priors): boxes[:, 2:] += boxes[:, :2] return boxes - def decode_landm(pre, priors): """Decode landm from predictions using priors to undo the encoding we did for offset regression at train time. @@ -105,7 +101,6 @@ def decode_landm(pre, priors): ), axis=1) return landmarks - def nms(dets, thresh): """Pure Python NMS baseline.""" x1 = dets[:, 0] @@ -136,66 +131,32 @@ def nms(dets, thresh): return keep -def parse_arg(): - if len(sys.argv) < 3: - print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") - exit(1) - - model_path = sys.argv[1] - platform = sys.argv[2] - - do_quant = DEFAULT_QUANT - if len(sys.argv) > 3: - model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: - print("ERROR: Invalid model type: {}".format(model_type)) - exit(1) - elif model_type == 'i8': - do_quant = True - else: - do_quant = False - - if len(sys.argv) > 4: - output_path = sys.argv[4] - else: - output_path = DEFAULT_RKNN_PATH - - return model_path, platform, do_quant, output_path - if __name__ == '__main__': - model_path, platform, do_quant, output_path = parse_arg() - # Create RKNN object - rknn = RKNN() - - # Pre-process config - print('--> Config model') - rknn.config(mean_values=[[104, 117, 123]], std_values=[[1, 1, 1]], target_platform=platform, - quantized_algorithm="normal", quant_img_RGB2BGR=True) # mmse - print('done') + parser = argparse.ArgumentParser(description='RetinaFace Python Demo', add_help=True) + # basic params + parser.add_argument('--model_path', type=str, required=True, + help='model path, could be .rknn file') + parser.add_argument('--target', type=str, + default='rk3566', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, + default=None, help='device id') + args = parser.parse_args() - # Load model - print('--> Loading model') - ret = rknn.load_onnx(model=model_path) - if ret != 0: - print('Load model failed!') - exit(ret) - print('done') + # Create RKNN object + rknn = RKNN(verbose=True) - # Build model - print('--> Building model') - ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH) + # Load RKNN model + ret = rknn.load_rknn(args.model_path) if ret != 0: - print('Build model failed!') + print('Load RKNN model 
\"{}\" failed!'.format(args.model_path)) exit(ret) print('done') - # Export rknn model - print('--> Export rknn model') - ret = rknn.export_rknn(output_path) + # Init runtime environment + print('--> Init runtime environment') + ret = rknn.init_runtime(target=args.target) if ret != 0: - print('Export rknn model failed!') + print('Init runtime environment failed!') exit(ret) print('done') @@ -206,14 +167,6 @@ def parse_arg(): letterbox_img, aspect_ratio, offset_x, offset_y = letterbox_resize(img, (model_height,model_width), 114) # letterbox缩放 infer_img = letterbox_img[..., ::-1] # BGR2RGB - # Init runtime environment - print('--> Init runtime environment') - ret = rknn.init_runtime() - if ret != 0: - print('Init runtime environment failed!') - exit(ret) - print('done') - # Inference print('--> Running model') outputs = rknn.inference(inputs=[infer_img]) diff --git a/examples/RetinaFace/python/convert.py b/examples/RetinaFace/python/convert.py new file mode 100644 index 0000000..2932b4a --- /dev/null +++ b/examples/RetinaFace/python/convert.py @@ -0,0 +1,73 @@ +import sys +from rknn.api import RKNN + +DATASET_PATH = '../model/dataset.txt' +DEFAULT_RKNN_PATH = '../model/RetinaFace.rknn' +DEFAULT_QUANT = True + +def parse_arg(): + if len(sys.argv) < 3: + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); + print(" platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]") + print(" dtype choose from [i8] for [rk3562, rk3566, rk3568, rk3588]") + print(" dtype choose from [u8] for [rk1808, rv1109, rv1126]") + exit(1) + + model_path = sys.argv[1] + platform = sys.argv[2] + + do_quant = DEFAULT_QUANT + if len(sys.argv) > 3: + model_type = sys.argv[3] + if model_type not in ['i8', 'u8', 'fp']: + print("ERROR: Invalid model type: {}".format(model_type)) + exit(1) + elif model_type in ['i8', 'u8']: + do_quant = True + else: + do_quant = False + + if len(sys.argv) > 4: + output_path = sys.argv[4] + else: + output_path = DEFAULT_RKNN_PATH + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() + + # Create RKNN object + rknn = RKNN(verbose=False) + + # Pre-process config + print('--> Config model') + rknn.config(mean_values=[[104, 117, 123]], std_values=[[1, 1, 1]], target_platform=platform) + print('done') + + # Load model + print('--> Loading model') + ret = rknn.load_onnx(model=model_path) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') + + # Build model + print('--> Building model') + ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') + + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model failed!') + exit(ret) + print('done') + + # Release + rknn.release() \ No newline at end of file diff --git a/examples/RetinaFace/python/result.jpg b/examples/RetinaFace/python/result.jpg index 441627d..b9361ea 100644 Binary files a/examples/RetinaFace/python/result.jpg and b/examples/RetinaFace/python/result.jpg differ diff --git a/examples/deeplabv3/README.md b/examples/deeplabv3/README.md index bfb87d4..6d1da0e 100644 --- a/examples/deeplabv3/README.md +++ b/examples/deeplabv3/README.md @@ -12,22 +12,38 @@ cd model ./download_model.sh ``` -## Python Script Usage +## Convert to RKNN + +*Usage:* + +```shell +cd python +python convert.py + +# 
such as: python convert.py ../model/deeplab-v3-plus-mobilenet-v2.pb rk3588
+# output model will be saved as ../model/deeplab-v3-plus-mobilenet-v2.rknn
+```
+
+*Description:*
+- <pb_model> should be the TensorFlow model path.
+- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588, RK1808, RV1109, RV1126 according to board SOC version.
+- <dtype> is *optional*, could be specified as `i8`, `u8` or `fp`; `i8`/`u8` means to do quantization, `fp` means not to do quantization, default is `i8`/`u8`.
+- <output_rknn_path> is *optional*, used to specify the saving path of the RKNN model; by default the model is saved in the same directory as the TensorFlow model with name `deeplab-v3-plus-mobilenet-v2.rknn`.
+
+
+## Python Demo
 
 *Usage:*
 
 ```shell
 cd python
-python deeplabv3.py
+python deeplabv3.py --model_path <rknn_model> --target <TARGET_PLATFORM>
 # such as: python deeplabv3.py ../model/deeplab-v3-plus-mobilenet-v2.pb rk3566
 # output model will be saved as ../model/deeplab-v3-plus-mobilenet-v2.rknn
 ```
 
 *Description:*
+- <TARGET_PLATFORM>: Specified as the NPU platform name. Such as 'rk3588'.
+- <rknn_model>: Specified as the model path.
 
-- <pb_model> should be the model path.
-- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588 according to board SOC version.
-- <dtype> is *optional*, could be specified as `i8` or `fp`, `i8` means to do quantization, `fp` means no to do quantization, default is `i8`.
-- <output_rknn_path> is *optional*, used to specify the saving path of the RKNN model, default save path is `../model/deeplab-v3-plus-mobilenet-v2.rknn`
-- <show_result> is *optional*, `plot` means to display the segmentation results on the screen (without automatically saving the segmentation results), and `save` means to directly save the segmentation results. The segmentation results are displayed by default.
 
 **Note**: Due to post-processing for resize and argmax, the model needs to be cropped to run on the C demo. This is shown below by python scripts.
 
@@ -40,7 +56,7 @@ rknn.load_tensorflow(ori_model,
 
 Where `logits/semantic/BiasAdd` are selected as output node for deeplabv3 model rather than the original model output node.
 
-## Expected Results
+*Expected Results*
 
 This example will print the segmentation result on the testing image, as follows:
 
@@ -49,26 +65,12 @@ This example will print the segmentation result on the testing image, as follows:
 
 ## Android Demo
+**Note: RK1808, RV1109, RV1126 do not support Android.**
 
 ### Compiling && Building
 
-Modify the path of Android NDK in '[build-android.sh](../../build-android.sh)'.
-
-For example,
-
-```sh
-ANDROID_NDK_PATH=~/opt/toolchain/android-ndk-r19c
-```
-
-Then, run this script:
-
-```sh
-# go back to the rknn_model_zoo root directory
-# cd <rknn_model_zoo_root_path>
-./build-android.sh -t <TARGET_PLATFORM> -a arm64-v8a -d deeplabv3
-```
-
-Please use the specific platform instead of above.
+Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to set up a cross-compilation environment and complete the compilation of C/C++ Demo.
+**Note: Please replace the model name with `deeplabv3`.**
 
 ### Push all build output file to the board
 
@@ -107,19 +109,9 @@ Note: The segmentation results will be saved in the `out.png`.
 
 ### Compiling && Building
 
-According to the target platform, modify the path of 'GCC_COMPILER' in 'build-linux.sh'.
+Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to set up a cross-compilation environment and complete the compilation of C/C++ Demo.
+**Note: Please replace the model name with `deeplabv3`.**
 
-```sh
-export GCC_COMPILER=/opt/tools/prebuilts/gcc/linux-x86/aarch64/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu
-```
-
-Then, run the script:
-
-```sh
-./build-linux.sh -t <TARGET_PLATFORM> -a aarch64 -d deeplabv3
-```
-
-Please use the specific platform instead of above.
 
 ### Push all build output file to the board
 
@@ -146,7 +138,7 @@ export LD_LIBRARY_PATH=./lib
 ./rknn_deeplabv3_demo model/deeplab-v3-plus-mobilenet-v2.rknn model/test_image.jpg
 ```
 
-**For error message: can't find libOpenCL.so **
+**For error message: can't find libOpenCL.so**
 
 This error can be fixed by creating a soft link to ARM mali library
 
 **/usr/lib/aarch64-linux-gnu/libmali.so.1.9.0**
 
@@ -157,32 +149,19 @@ ln -s /usr/lib/aarch64-linux-gnu/libmali.so.1.9.0 libOpenCL.so
 
 Then copy libOpenCL.so to path of your lib in demo and run it again.
 
 Note: **If the libmali.so cannot be found on the path of /usr/lib, please try to search this library in whole system or upgrade firmware to make sure there is a file named libmali.so on the board.**
+Note: **RK1808, RV1109, and RV1126 platforms do not have a GPU. The CPP demo under the `cpp/rknpu1` folder does not use a GPU implementation, so there won't be an issue of not finding libOpenCL.so.**
 
+## Notes
 
-## How to Modify C demo
-
-### Rquirements
-
-1. **The deeplabv3 C demo under the cpp folder only supports platforms of RK356x and RK3588 with GPU unit to run the postprocessing.**
 
-**Note: if user wants to use other deeplabv3 model with differrnt size, please modify the following variables. **
+**If you want to use another deeplabv3 model with a different size, please modify the following variable.**
 
 ```C++
-// Modify it accroding to model io input
-constexpr size_t OUT_SIZE = 65;
-constexpr size_t MASK_SIZE = 513;
-constexpr size_t NUM_LABEL = 21;
+// Modify it according to the number of labels required by your model
+const size_t NUM_LABEL = 21;
 ```
 
-Where the 'OUT_SIZE' refers to output size of your deeplabv3 model, 'MASK_SIZE' is the same as input size of your model.
 
-**In addition to these variables, the cl kernel needs to be modified as shown below**.
 
-```c
-#define SRC_STRIDE 1365 //65*21
-```
 
-**This variable must be modified according to your model output size multiplied with the number of label.
It locates in file kernel_upsampleSoftmax.h in path_to_your_rknn_model_zoo/examples/Deeplabv3/cpp/gpu_postprocess/cl_kernels.** ## Expected Results diff --git a/examples/deeplabv3/cpp/CMakeLists.txt b/examples/deeplabv3/cpp/CMakeLists.txt index 67026a2..07170a4 100644 --- a/examples/deeplabv3/cpp/CMakeLists.txt +++ b/examples/deeplabv3/cpp/CMakeLists.txt @@ -16,23 +16,62 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) add_subdirectory(gpu_postprocess) +#opencv +if(CMAKE_SIZEOF_VOID_P EQUAL 8) +message(STATUS "64bit") +set (TARGET_LIB_ARCH lib64) +else() +message(STATUS "32bit") +set (TARGET_LIB_ARCH lib) +endif() +if (CMAKE_SYSTEM_NAME STREQUAL "Android") + set(OpenCV_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/opencv/opencv-android-sdk-build/sdk/native/jni/abi-${CMAKE_ANDROID_ARCH_ABI}) +else() + if(TARGET_LIB_ARCH STREQUAL "lib") + set(OpenCV_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/opencv/opencv-linux-armhf/share/OpenCV) + else() + set(OpenCV_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/opencv/opencv-linux-aarch64/share/OpenCV) + endif() +endif() +find_package(OpenCV REQUIRED) +message(STATUS OpenCV_DIR=${OpenCV_DIR}) +message(STATUS OpenCV_LIBS=${OpenCV_LIBS}) + set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(deeplabv3_file rknpu1/deeplabv3.cc) +else() + set(deeplabv3_file rknpu2/deeplabv3.cc) +endif() + file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc - rknpu2/deeplabv3.cc + ${deeplabv3_file} ) +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + target_link_libraries(${PROJECT_NAME} + fileutils + imageutils + imagedrawing + ${OpenCV_LIBS} + ${LIBRKNNRT} + dl + ) +else() + target_link_libraries(${PROJECT_NAME} + fileutils + imageutils + imagedrawing + ${OpenCV_LIBS} + ${LIBRKNNRT} + gpu_postprocess + ) +endif() -target_link_libraries(${PROJECT_NAME} - fileutils - imageutils - imagedrawing - ${LIBRKNNRT} - gpu_postprocess -) if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_link_libraries(${PROJECT_NAME} log) diff --git a/examples/deeplabv3/cpp/deeplabv3.h b/examples/deeplabv3/cpp/deeplabv3.h index eeda2d2..ab22a65 100644 --- a/examples/deeplabv3/cpp/deeplabv3.h +++ b/examples/deeplabv3/cpp/deeplabv3.h @@ -4,10 +4,6 @@ #include "rknn_api.h" #include "common.h" -//mod it accroding to model io input -constexpr size_t OUT_SIZE = 65; -constexpr size_t MASK_SIZE = 513; -constexpr size_t NUM_LABEL = 21; typedef struct { @@ -20,14 +16,10 @@ typedef struct { int model_height; } rknn_app_context_t; -typedef struct { - unsigned char out_mask[MASK_SIZE*MASK_SIZE]; -} deeplabv3_result; - int init_deeplabv3_model(const char* model_path, rknn_app_context_t* app_ctx); int release_deeplabv3_model(rknn_app_context_t* app_ctx); -int inference_deeplabv3_model(rknn_app_context_t* app_ctx, image_buffer_t* img, deeplabv3_result* out_result); +int inference_deeplabv3_model(rknn_app_context_t* app_ctx, image_buffer_t* img); #endif //_RKNN_DEMO_DEEPLABV3_H_ \ No newline at end of file diff --git a/examples/deeplabv3/cpp/gpu_postprocess/cl_kernels/kernel_upsampleSoftmax.h b/examples/deeplabv3/cpp/gpu_postprocess/cl_kernels/kernel_upsampleSoftmax.h index 9a40747..95bebe3 100644 --- a/examples/deeplabv3/cpp/gpu_postprocess/cl_kernels/kernel_upsampleSoftmax.h +++ 
b/examples/deeplabv3/cpp/gpu_postprocess/cl_kernels/kernel_upsampleSoftmax.h @@ -1,14 +1,14 @@ #ifndef _RKNN_DEMO_DEEPLABV3_KERNEL_H_ #define _RKNN_DEMO_DEEPLABV3_KERNEL_H_ +//"#define SRC_STRIDE 1365 //65*21\n" const char* CL_kernel_string_src = "#define INC(x,l) min(x+1,l-1)\n" -"#define SRC_STRIDE 1365 //65*21\n" "__kernel void UpsampleSoftmax(__global const float* src_buf, __global uchar* index_buf, " " const int srcHeight, const int srcWidth, " " const int dstHeight, const int dstWidth, const int dstChannel," -" const float scale_h_inv, const float scale_w_inv, const int img_array_size) {\n" +" const float scale_h_inv, const float scale_w_inv, const int SRC_STRIDE) {\n" " int dx = get_global_id(0);//dst width \n" " int dy = get_global_id(1);//dst height \n" " float sx = ((dx) * scale_w_inv), sy = ((dy) * scale_h_inv); //resize\n" diff --git a/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.cc b/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.cc index b395f20..724772b 100644 --- a/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.cc +++ b/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.cc @@ -119,7 +119,7 @@ namespace gpu_postprocess std::string src_name, float *src_img_ptr, std::string res_name, unsigned char *res, const int srcHeight, const int srcWidth, const int dstHeight, const int dstWidth, const int dstChannel, - const float scale_h_inv, const float scale_w_inv, const int img_arr_size) { + const float scale_h_inv, const float scale_w_inv, const int src_stride) { if (workspace_->kernel_maps.find(kernel_name) == workspace_->kernel_maps.end()) { LOGE("kernel: '%s' kernel not found\n", kernel_name.c_str()); @@ -150,7 +150,7 @@ namespace gpu_postprocess OPENCL_CALL(clSetKernelArg(kernel, arg_idx++, sizeof(int), &dstChannel)); OPENCL_CALL(clSetKernelArg(kernel, arg_idx++, sizeof(float), &scale_h_inv)); OPENCL_CALL(clSetKernelArg(kernel, arg_idx++, sizeof(float), &scale_w_inv)); - OPENCL_CALL(clSetKernelArg(kernel, arg_idx++, sizeof(int), &img_arr_size)); + OPENCL_CALL(clSetKernelArg(kernel, arg_idx++, sizeof(int), &src_stride)); // LOGI("dst_height = %d, dst_width = %d, dstChannel = %d, srcHeight = %d, srcWidth = %d, scale_h_inv = %f, scale_w_inv = %f, wf->global_ws_len_ = %d, wf->global_ws_[0] = %d, wf->global_ws_[1] = %d\n", diff --git a/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.h b/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.h index 937f833..2d96a89 100644 --- a/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.h +++ b/examples/deeplabv3/cpp/gpu_postprocess/gpu_compose_impl.h @@ -61,7 +61,7 @@ namespace gpu_postprocess std::string src_name, float *src_img_ptr, std::string res_name, unsigned char *res, const int srcHeight, const int srcWidth, const int dstHeight, const int dstWidth, const int dstChannel, - const float scale_h_inv, const float scale_w_inv, const int img_arr_size); + const float scale_h_inv, const float scale_w_inv, const int src_stride); int UpsampleSoftmaxImage2D(std::string kernel_name, std::string src_name, float *src_img_ptr, std::string res_name, unsigned char *res, const int dstHeight, const int dstWidth, const int dstChannel, const int img_array_size); diff --git a/examples/deeplabv3/cpp/main.cc b/examples/deeplabv3/cpp/main.cc index c1702a5..89ece0f 100644 --- a/examples/deeplabv3/cpp/main.cc +++ b/examples/deeplabv3/cpp/main.cc @@ -56,9 +56,7 @@ int main(int argc, char** argv) return -1; } - deeplabv3_result results{}; - - ret = inference_deeplabv3_model(&rknn_app_ctx, &src_image, &results); + ret 
= inference_deeplabv3_model(&rknn_app_ctx, &src_image);
 if (ret != 0) {
 printf("init_deeplabv3_model fail! ret=%d\n", ret);
 goto out;
@@ -67,7 +65,6 @@ int main(int argc, char** argv)
 //show image
 write_image("out.png", &src_image);
-
 out:
 ret = release_deeplabv3_model(&rknn_app_ctx);
 if (ret != 0) {
diff --git a/examples/deeplabv3/cpp/rknpu1/deeplabv3.cc b/examples/deeplabv3/cpp/rknpu1/deeplabv3.cc
new file mode 100644
index 0000000..3f2bc13
--- /dev/null
+++ b/examples/deeplabv3/cpp/rknpu1/deeplabv3.cc
@@ -0,0 +1,321 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "deeplabv3.h"
+#include "common.h"
+#include "file_utils.h"
+#include "image_utils.h"
+#include 
+
+#define NUM_LABEL 21
+
+static int Dump_bin_to_file(void *pBuffer, const char *fileName, const size_t sz_data)
+{
+
+    FILE *pFile = fopen(fileName, "wb");
+    if (pFile == NULL)
+    {
+        puts("error in outputting files.");
+        return -1;
+    }
+
+    fwrite(pBuffer, 1, sz_data, pFile);
+    fflush(pFile);
+
+    if (fclose(pFile) != 0)
+    {
+        puts("Error in closing files.");
+        return -1;
+    }
+
+    return 0;
+}
+
+static constexpr int FULL_COLOR_MAP[NUM_LABEL][3] = {
+    {0, 0, 0},
+    {128, 0, 0},
+    {0, 128, 0},
+    {128, 128, 0},
+    {0, 0, 128},
+    {128, 0, 128},
+    {0, 128, 128},
+    {128, 128, 128},
+    {64, 0, 0},
+    {192, 0, 0},
+    {64, 128, 0},
+    {192, 128, 0},
+    {64, 0, 128},
+    {192, 0, 128},
+    {64, 128, 128},
+    {192, 128, 128},
+    {0, 64, 0},
+    {128, 64, 0},
+    {0, 192, 0},
+    {128, 192, 0},
+    {0, 64, 128}
+
+};
+
+static void dump_tensor_attr(rknn_tensor_attr *attr)
+{
+    printf("  index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, "
+           "zp=%d, scale=%f\n",
+           attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0],
+           attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type),
+           get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale);
+}
+
+static void resize_by_opencv(float *input_image, int input_width, int input_height, float *output_image, int target_width, int target_height)
+{
+    cv::Mat src_image(input_height, input_width, CV_MAKETYPE(CV_32F, NUM_LABEL), input_image);
+    cv::Mat dst_image(target_height, target_width, CV_MAKETYPE(CV_32F, NUM_LABEL), output_image);
+    cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR);
+    memcpy(output_image, dst_image.data, target_width * target_height * NUM_LABEL * sizeof(float));
+}
+
+static void compose_img(uint8_t *res_buf, uint8_t *img_buf, int height, int width)
+{
+    // blending two images
+    // using gamma 0 and alpha 0.5
+    const float alpha = 0.5f;
+    float beta = 1.0 - alpha;
+
+    for (int h = 0; h < height; ++h)
+    {
+        for (int w = 0; w < width; ++w)
+        {
+            unsigned char map_label = res_buf[h * width + w];
+
+            auto ori_pixel_r = img_buf[h * width * 3 + w * 3];
+            auto ori_pixel_g = img_buf[h * width * 3 + w * 3 + 1];
+            auto ori_pixel_b = img_buf[h * width * 3 + w * 3 + 2];
+
+            img_buf[h * width * 3 + w * 3] = FULL_COLOR_MAP[map_label][0] * alpha + ori_pixel_r * beta; // r
+            img_buf[h * width * 3 + w * 3 + 1] = FULL_COLOR_MAP[map_label][1] * alpha + ori_pixel_g * beta; // g
+            img_buf[h * width * 3 + w * 3 + 2] = FULL_COLOR_MAP[map_label][2] * alpha + ori_pixel_b * beta; // b
+        }
+    }
+}
+
+int init_deeplabv3_model(const char *model_path, rknn_app_context_t *app_ctx)
+{
+    using namespace std;
+
+    int ret;
+    int model_len = 0;
+    char *model;
+    rknn_context ctx = 0;
+
+    // Load RKNN Model
+    model_len = read_data_from_file(model_path, &model);
+    if (model == 
NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_deeplabv3_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +static void post_process(float *input, uint8_t *output, int seg_width, int seg_height, int n_label, int out_width, int out_height) +{ + float *mask = (float *)malloc(out_width * out_height * n_label * sizeof(float)); + resize_by_opencv(input, seg_width, seg_height, mask, out_width, out_height); + + // Find the index of the maximum value along the last axis + int max_index; + for (int i = 0; i < out_height * out_width; i++) + { + max_index = 0; + for (int j = 1; j < n_label; j++) + { + if (mask[i * n_label + j] > mask[i * n_label + max_index]) + { + max_index = j; + } + } + output[i] = max_index; + } + + free(mask); +} + +int inference_deeplabv3_model(rknn_app_context_t 
*app_ctx, image_buffer_t *src_img)
+{
+    int ret;
+    image_buffer_t img;
+    rknn_input inputs[1];
+    rknn_output outputs[1];
+
+    memset(&img, 0, sizeof(image_buffer_t));
+    memset(inputs, 0, sizeof(inputs));
+    memset(outputs, 0, sizeof(outputs));
+
+    // Pre Process
+    img.width = app_ctx->model_width;
+    img.height = app_ctx->model_height;
+    img.format = IMAGE_FORMAT_RGB888;
+    img.size = get_image_size(&img);
+    img.virt_addr = (unsigned char *)malloc(img.size);
+    uint8_t *seg_img = (uint8_t *)malloc(img.width * img.height * sizeof(uint8_t));
+    if (img.virt_addr == NULL)
+    {
+        printf("malloc buffer size:%d fail!\n", img.size);
+        return -1;
+    }
+
+    ret = convert_image(src_img, &img, NULL, NULL, 0);
+    if (ret < 0)
+    {
+        printf("convert_image fail! ret=%d\n", ret);
+        return -1;
+    }
+
+    // Set Input Data
+    inputs[0].index = 0;
+    inputs[0].type = RKNN_TENSOR_UINT8;
+    inputs[0].fmt = RKNN_TENSOR_NHWC;
+    inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel;
+    inputs[0].buf = img.virt_addr;
+
+    ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs);
+    if (ret < 0)
+    {
+        printf("rknn_inputs_set fail! ret=%d\n", ret);
+        return -1;
+    }
+
+    // Run
+    printf("rknn_run\n");
+    ret = rknn_run(app_ctx->rknn_ctx, nullptr);
+    if (ret < 0)
+    {
+        printf("rknn_run fail! ret=%d\n", ret);
+        return -1;
+    }
+
+    // Get Output
+    outputs[0].want_float = 1;
+    ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL);
+    if (ret < 0)
+    {
+        printf("rknn_outputs_get fail! ret=%d\n", ret);
+        goto out;
+    }
+
+    // Post Process
+    post_process((float *)outputs[0].buf, seg_img, app_ctx->output_attrs[0].dims[2], app_ctx->output_attrs[0].dims[1], app_ctx->output_attrs[0].dims[0],
+                 img.width, img.height);
+
+    // draw mask
+    compose_img(seg_img, src_img->virt_addr, src_img->height, src_img->width);
+    free(seg_img);
+
+    // Remember to release the rknn output
+    rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs);
+
+out:
+    if (img.virt_addr != NULL)
+    {
+        free(img.virt_addr);
+    }
+
+    return ret;
+}
\ No newline at end of file
diff --git a/examples/deeplabv3/cpp/rknpu2/deeplabv3.cc b/examples/deeplabv3/cpp/rknpu2/deeplabv3.cc
index 8550815..b9c79fa 100644
--- a/examples/deeplabv3/cpp/rknpu2/deeplabv3.cc
+++ b/examples/deeplabv3/cpp/rknpu2/deeplabv3.cc
@@ -13,6 +13,8 @@
 #include "gpu_compose_impl.h"
 #include "cl_kernels/kernel_upsampleSoftmax.h"
+
+
 using namespace gpu_postprocess;
 namespace {
@@ -20,6 +22,7 @@ namespace {
 const constexpr char UP_SOFTMAX_IN0[] = "UP_SOFTMAX_IN";
 const constexpr char UP_SOFTMAX_OUT0[] = "UP_SOFTMAX_OUT";
+    const size_t NUM_LABEL = 21;
 std::shared_ptr Gpu_Impl = nullptr;
}
@@ -46,55 +49,51 @@ static int Dump_bin_to_file(void *pBuffer, const char *fileName, const size_t sz
 return 0;
}
+static constexpr int FULL_COLOR_MAP[NUM_LABEL][3] = {
+    {0, 0, 0},
-static constexpr int FULL_COLOR_MAP[NUM_LABEL][3]= {
-    { 0 ,0 , 0},
-
-    {128, 0 ,0},
+    {128, 0, 0},
-    { 0, 128, 0},
+    {0, 128, 0},
-    {128, 128, 0},
+    {128, 128, 0},
-    { 0 , 0, 128},
+    {0, 0, 128},
-    {128, 0, 128},
+    {128, 0, 128},
-    {0, 128 , 128},
+    {0, 128, 128},
-    {128, 128, 128},
+    {128, 128, 128},
-    { 64 ,0 ,0},
+    {64, 0, 0},
-    {192, 0 ,0},
+    {192, 0, 0},
-    { 64, 128 , 0},
+    {64, 128, 0},
-    {192, 128 , 0},
+    {192, 128, 0},
-    {64 ,0 , 128},
+    {64, 0, 128},
-    {192, 0 , 128},
+    {192, 0, 128},
-    { 64 ,128 ,128},
+    {64, 128, 128},
-    {192, 128, 128},
+    {192, 128, 128},
-    { 0 , 64 , 0},
+    {0, 64, 0},
-    {128 , 64 , 0},
+    {128, 64, 0},
-    { 0, 192, 0},
+    {0, 192, 0},
-    {128, 192 ,0},
+    {128, 192, 0},
-{0, 64, 128}
+    {0, 64, 128}
};
-
-
-
static 
void dump_tensor_attr(rknn_tensor_attr* attr) { printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " @@ -285,11 +284,6 @@ int init_deeplabv3_model(const char* model_path, rknn_app_context_t* app_ctx) int release_deeplabv3_model(rknn_app_context_t* app_ctx) { - - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -298,10 +292,14 @@ int release_deeplabv3_model(rknn_app_context_t* app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } -int inference_deeplabv3_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, deeplabv3_result* out_result) +int inference_deeplabv3_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img) { using namespace std; @@ -309,6 +307,21 @@ int inference_deeplabv3_model(rknn_app_context_t* app_ctx, image_buffer_t* src_i image_buffer_t img; memset(&img, 0, sizeof(image_buffer_t)); + //fetch model IO info according to NHWC layout !!! + //OUT_SIZE is only for square output size + size_t OUT_SIZE=0; + size_t MASK_SIZE=0; + + if (app_ctx->input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + OUT_SIZE = app_ctx->output_attrs[0].dims[2]; //65 + MASK_SIZE = app_ctx->input_attrs[0].dims[3]; //513 + } + else { + printf("model is NHWC input fmt\n"); + OUT_SIZE = app_ctx->output_attrs[0].dims[2]; //65 + MASK_SIZE = app_ctx->input_attrs[0].dims[2]; //513 + } // Pre Process img.width = app_ctx->model_width; @@ -392,8 +405,6 @@ int inference_deeplabv3_model(rknn_app_context_t* app_ctx, image_buffer_t* src_i return -1; } } - - rknn_set_core_mask(app_ctx->rknn_ctx, RKNN_NPU_CORE_0_1_2); rknn_tensor_mem *post_buf_mem[1]; post_buf_mem[0] = rknn_create_mem(app_ctx->rknn_ctx, MASK_SIZE * MASK_SIZE); @@ -427,14 +438,14 @@ int inference_deeplabv3_model(rknn_app_context_t* app_ctx, image_buffer_t* src_i return -1; } + const auto SRC_STRIDE = OUT_SIZE * NUM_LABEL; + // Post Process Gpu_Impl->UpsampleSoftmax(UPSAMPLE_SOFTMAX_KERNEL_NAME, UP_SOFTMAX_IN0, nullptr, - UP_SOFTMAX_OUT0, nullptr, OUT_SIZE, OUT_SIZE, MASK_SIZE, MASK_SIZE, NUM_LABEL, scale_h_inv, scale_w_inv, NUM_LABEL); + UP_SOFTMAX_OUT0, nullptr, OUT_SIZE, OUT_SIZE, MASK_SIZE, MASK_SIZE, NUM_LABEL, scale_h_inv, scale_w_inv, SRC_STRIDE); compose_img((unsigned char *)post_buf_mem[0]->virt_addr, src_img->virt_addr, MASK_SIZE, MASK_SIZE); - memcpy(out_result->out_mask, post_buf_mem[0]->virt_addr, MASK_SIZE*MASK_SIZE*sizeof(uint8_t)); - //For debugging purpose //Dump_bin_to_file(out_result->img, "test_img_out.bin", MASK_SIZE*MASK_SIZE*3*sizeof(uint8_t)); diff --git a/examples/deeplabv3/python/convert.py b/examples/deeplabv3/python/convert.py new file mode 100644 index 0000000..9b806d4 --- /dev/null +++ b/examples/deeplabv3/python/convert.py @@ -0,0 +1,76 @@ +import sys +from rknn.api import RKNN + +DATASET_PATH = '../model/dataset.txt' +DEFAULT_RKNN_PATH = '../model/deeplab-v3-plus-mobilenet-v2.rknn' +DEFAULT_QUANT = True + +def parse_arg(): + if len(sys.argv) < 3: + print("Usage: python3 {} pb_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); + print(" platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] 
for [rk1808,rv1109,rv1126]") + exit(1) + + model_path = sys.argv[1] + platform = sys.argv[2] + + do_quant = DEFAULT_QUANT + if len(sys.argv) > 3: + model_type = sys.argv[3] + if model_type not in ['i8', 'u8', 'fp']: + print("ERROR: Invalid model type: {}".format(model_type)) + exit(1) + elif model_type in ['i8', 'u8']: + do_quant = True + else: + do_quant = False + + if len(sys.argv) > 4: + output_path = sys.argv[4] + else: + output_path = DEFAULT_RKNN_PATH + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() + + # Create RKNN object + rknn = RKNN(verbose=False) + + # Pre-process config + print('--> Config model') + rknn.config(mean_values=[[127.5, 127.5, 127.5]], std_values=[[127.5, 127.5, 127.5]], target_platform=platform) + print('done') + + # Load model + print('--> Loading model') + ret = rknn.load_tensorflow(model_path, + inputs=['sub_7'], + outputs=['logits/semantic/BiasAdd'], + input_size_list=[[1,513,513,3]]) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') + + # Build model + print('--> Building model') + ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') + + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model failed!') + exit(ret) + print('done') + + # Release + rknn.release() diff --git a/examples/deeplabv3/python/deeplabv3.py b/examples/deeplabv3/python/deeplabv3.py index c649ea5..1345fa7 100644 --- a/examples/deeplabv3/python/deeplabv3.py +++ b/examples/deeplabv3/python/deeplabv3.py @@ -1,27 +1,21 @@ ''' Author: Chao Li Date: 2023-10-23 09:19:52 -LastEditTime: 2023-11-29 15:47:23 +LastEditTime: 2024-01-29 14:41:42 Editors: Chao Li Description: Convert the deeplabv3 model trained by TensorFlow into RKNN model, and then use the RKNN model for inference. 
''' import numpy as np from matplotlib import pyplot as plt import cv2 - -import sys - from matplotlib import gridspec - +import torch +import torch.nn.functional as F from rknn.api import RKNN - import get_dataset_colormap +import argparse TEST_IMG_PATH='../model/test_image.jpg' -DATASET_PATH='../model/dataset.txt' -DEFAULT_RKNN_PATH = '../model/deeplab-v3-plus-mobilenet-v2.rknn' -DEFAULT_QUANT = True - OUT_SIZE = 513 LABEL_NAMES = np.asarray([ @@ -68,69 +62,35 @@ def vis_segmentation(image, seg_map): plt.show() -def post_process(outputs): - seg_img = np.argmax(outputs, axis=-1) - seg_h = seg_img.shape[2] - seg_w = seg_img.shape[1] - seg_img = np.reshape(seg_img, (seg_w, seg_h)).astype(np.uint8) - seg_img = cv2.resize(seg_img, (OUT_SIZE, OUT_SIZE)) +def post_process(output): + output = np.transpose(output, (0, 3, 1, 2)) + output = F.interpolate(torch.tensor(output), torch.Size( + [OUT_SIZE, OUT_SIZE]), mode='bilinear', align_corners=False) + output = np.transpose(output.numpy(), (0, 2, 3, 1)) + seg_img = np.argmax(output, axis=-1) + seg_img = np.reshape(seg_img, (OUT_SIZE, OUT_SIZE)).astype(np.uint8) return seg_img -def parse_arg(): - if len(sys.argv) < 3: - print("Usage: python3 {} pb_model_path [platform] [dtype(optional)] [output_rknn_path(optional)] [plot/save(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") - exit(1) - - model_path = sys.argv[1] - platform = sys.argv[2] - - do_quant = DEFAULT_QUANT - if len(sys.argv) > 3: - model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: - print("ERROR: Invalid model type: {}".format(model_type)) - exit(1) - elif model_type == 'i8': - do_quant = True - else: - do_quant = False - - if len(sys.argv) > 4: - output_path = sys.argv[4] - else: - output_path = DEFAULT_RKNN_PATH - - if len(sys.argv) > 5: - plot_control = sys.argv[5] - assert plot_control in ['plot', 'save'] - else: - plot_control = 'plot' - - return model_path, platform, do_quant, output_path, plot_control - if __name__ == '__main__': - model_path, platform, do_quant, output_path, plot_control = parse_arg() + parser = argparse.ArgumentParser(description='deeplabv3 Python Demo', add_help=True) + # basic params + parser.add_argument('--model_path', type=str, required=True, + help='model path, could be .rknn file') + parser.add_argument('--target', type=str, + default='rk3566', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, + default=None, help='device id') + args = parser.parse_args() # Create RKNN object rknn = RKNN() - rknn.config(mean_values=[127.5, 127.5, 127.5], std_values=[127.5, 127.5, 127.5], quant_img_RGB2BGR=False,target_platform=platform) - - # Load model - print('--> Loading model') - - rknn.load_tensorflow(model_path, - inputs=['sub_7'], - outputs=['logits/semantic/BiasAdd'], - input_size_list=[[1,513,513,3]]) - print('done') - - # Build model - print('--> Building model') - rknn.build(do_quantization=do_quant , dataset=DATASET_PATH) + # Load RKNN model + ret = rknn.load_rknn(args.model_path) + if ret != 0: + print('Load RKNN model \"{}\" failed!'.format(args.model_path)) + exit(ret) print('done') # Set inputs @@ -140,7 +100,7 @@ def parse_arg(): # init runtime environment print('--> Init runtime environment') - ret = rknn.init_runtime() + ret = rknn.init_runtime(target=args.target, device_id=args.device_id) if ret != 0: print('Init runtime environment failed') exit(ret) @@ -153,16 +113,15 @@ def parse_arg(): seg_map = post_process(outputs[0]) - if 
plot_control == 'plot':
-        vis_segmentation(img, seg_map)
-    elif plot_control == 'save':
-        seg_img = get_dataset_colormap.label_to_color_image(
-            seg_map, get_dataset_colormap.get_pascal_name()).astype(np.uint8)
-        overlay = img*0.5 + seg_img*0.5
-        overlay = overlay.astype(np.uint8)
-        overlay = cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR)
-        cv2.imwrite('output.png', overlay)
-
-    rknn.export_rknn(output_path)
+    # plot img
+    vis_segmentation(img, seg_map)
+
+    # save result
+    seg_img = get_dataset_colormap.label_to_color_image(
+        seg_map, get_dataset_colormap.get_pascal_name()).astype(np.uint8)
+    overlay = img*0.5 + seg_img*0.5
+    overlay = overlay.astype(np.uint8)
+    overlay = cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR)
+    cv2.imwrite('output.png', overlay)

 rknn.release()
diff --git a/examples/deeplabv3/reference_results/python_demo_result.png b/examples/deeplabv3/reference_results/python_demo_result.png
index 1ac5f0c..42e0b63 100644
Binary files a/examples/deeplabv3/reference_results/python_demo_result.png and b/examples/deeplabv3/reference_results/python_demo_result.png differ
diff --git a/examples/lite_transformer/README.md b/examples/lite_transformer/README.md
index 459c517..327266c 100644
--- a/examples/lite_transformer/README.md
+++ b/examples/lite_transformer/README.md
@@ -28,7 +28,7 @@ https://github.com/airockchip/lite-transformer

## 2. Current Support Platform

-RK3566, RK3568, RK3588, RK3562
+RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126

@@ -67,36 +67,19 @@ python convert.py ../model/lite-transformer-decoder-16.onnx rk3588

- `<onnx_model>`: Specify ONNX model path.
- `<TARGET_PLATFORM>`: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<dtype>(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `fp`. Currently not support `i8` lite transformer model in this version.
+- `<dtype>(optional)`: Specify as `i8`/`u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `fp`. `i8`/`u8` lite transformer models are currently not supported in this version.
- `<output_rknn_path>(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model name with `rknn` suffix.

## 5. Android Demo

-#### 5.1 Compile and Build
-
-*Usage:*
-
-```sh
-# go back to the rknn_model_zoo root directory
-cd ../../
-export ANDROID_NDK_PATH=<android_ndk_path>
-
-./build-android.sh -t <TARGET_PLATFORM> -a <ARCH> -d lite_transformer
+**Note: RK1808, RV1109, RV1126 do not support Android.**

-# such as
-./build-android.sh -t rk3588 -a arm64-v8a -d lite_transformer
-```

+#### 5.1 Compile and Build

-*Description:*
-- `<android_ndk_path>`: Specify Android NDK path.
-- `<TARGET_PLATFORM>`: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<ARCH>`: Specify device system architecture. To query device architecture, refer to the following command:
-  ```shell
-  # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should shown in log.
-  adb shell cat /proc/version
-  ```
+Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo.
+**Note: Please replace the model name with `lite_transformer`.**

#### 5.2 Push demo files to device

@@ -122,36 +105,12 @@ export LD_LIBRARY_PATH=./lib

-
## 6. 
Linux Demo #### 6.1 Compile and Build -*Usage:* - -```shell -# go back to the rknn_model_zoo root directory -cd ../../ - -# if GCC_COMPILER not found while building, please set GCC_COMPILER path -(optional)export GCC_COMPILER= - -./build-linux.sh -t -a -d lite_transformer - -# such as -./build-linux.sh -t rk3588 -a aarch64 -d lite_transformer -``` - -*Description:* - -- ``: Specified as GCC_COMPILER path. -- `` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - - ```shell - # Query architecture. For Linux, ['aarch64' or 'armhf'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `lite_transformer`.** #### 6.2 Push demo files to device diff --git a/examples/lite_transformer/cpp/CMakeLists.txt b/examples/lite_transformer/cpp/CMakeLists.txt index 790c230..beb9adc 100644 --- a/examples/lite_transformer/cpp/CMakeLists.txt +++ b/examples/lite_transformer/cpp/CMakeLists.txt @@ -20,18 +20,31 @@ file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) include_directories(${CMAKE_SOURCE_DIR}/utils) include_directories(${CMAKE_SOURCE_DIR}/utils/cnpy) include_directories(${CMAKE_SOURCE_DIR}/utils/bpe_tools) -include_directories(${CMAKE_SOURCE_DIR}/rknpu2/rkdemo_utils) + + +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(lite_transformer_file rknpu1/lite_transformer.cc) + set(rkdemo_utils rknpu1/rkdemo_utils/rknn_demo_utils.cc) + include_directories(${CMAKE_SOURCE_DIR}/rknpu1/rkdemo_utils) + add_definitions("-DRKNPU1") +else() + set(lite_transformer_file rknpu2/lite_transformer.cc) + set(rkdemo_utils rknpu2/rkdemo_utils/rknn_demo_utils.cc) + include_directories(${CMAKE_SOURCE_DIR}/rknpu2/rkdemo_utils) +endif() + add_executable(${PROJECT_NAME} main.cc - rknpu2/lite_transformer.cc - rknpu2/rkdemo_utils/rknn_demo_utils.cc + ${lite_transformer_file} + ${rkdemo_utils} utils/bpe_tools/bpe_tools.cc ) target_link_libraries(${PROJECT_NAME} fileutils ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") @@ -58,4 +71,4 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/cw_token_map_order.txt DESTIN install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/position_embed.bin DESTINATION model) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/token_embed.bin DESTINATION model) file(GLOB RKNN_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../model/*.rknn") -install(FILES ${RKNN_FILES} DESTINATION model) \ No newline at end of file +install(FILES ${RKNN_FILES} DESTINATION model) diff --git a/examples/lite_transformer/cpp/lite_transformer.h b/examples/lite_transformer/cpp/lite_transformer.h index 4173dfe..826dbc0 100644 --- a/examples/lite_transformer/cpp/lite_transformer.h +++ b/examples/lite_transformer/cpp/lite_transformer.h @@ -25,6 +25,7 @@ #define HEAD_NUM 4 #define EMBEDDING_DIM 256 #define DECODER_LAYER_NUM 3 +#define MAX_SENTENCE_LEN 16 // #define DECODER_LAYER_NUM 6 diff --git a/examples/lite_transformer/cpp/rknpu1/lite_transformer.cc b/examples/lite_transformer/cpp/rknpu1/lite_transformer.cc new file mode 100644 index 0000000..c38484a --- /dev/null +++ b/examples/lite_transformer/cpp/rknpu1/lite_transformer.cc @@ -0,0 +1,672 @@ +// Copyright (c) 2023 by 
Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include +#include +#include +#include +#include +#include +#define _BASETSD_H +#include +#include + +#include +#include "type_half.h" + +#include "rknn_api.h" +// #include "cnpy.h" + +#include "easy_timer.h" +#include "bpe_tools.h" +#include "rknn_demo_utils.h" +#include "lite_transformer.h" + +#define USE_NORMAL_API 1 + +/*------------------------------------------- + Functions +-------------------------------------------*/ + +// static void save_npy(const char* output_path, float* output_data, rknn_tensor_attr* output_attr) +// { +// std::vector output_shape; + +// for (uint32_t i = 0; i < output_attr->n_dims; ++i) { +// // output_shape.push_back(output_attr->dims[output_attr->n_dims - i - 1]); // toolkit1 is inverse +// output_shape.push_back(output_attr->dims[i]); // toolkit 2 +// } + +// cnpy::npy_save(output_path, output_data, output_shape); +// } + + +int token_embeding(float *token_embed, float *position_embed, int *tokens, int len, float *embedding){ + float scale = sqrt(EMBEDDING_DIM); + int pad = 1; + for (int i = 0; i < len; i++){ + for (int j = 0; j < EMBEDDING_DIM; j++){ + embedding[i * EMBEDDING_DIM + j] = token_embed[tokens[i] * EMBEDDING_DIM + j] * scale; + } + } + + for (int i = 0; i < len; i++){ + if (tokens[i] != 1){ + pad++; + } + else{ + pad = 1; + } + for (int j = 0; j < EMBEDDING_DIM; j++){ + embedding[i * EMBEDDING_DIM + j] += position_embed[EMBEDDING_DIM * pad + j]; + } + } + return 0; +} + + +int load_bin_fp32(const char* filename, float* data, int len) +{ + FILE *fp_token_embed = fopen(filename, "rb"); + if (fp_token_embed != NULL) { + fread(data, sizeof(float), len, fp_token_embed); + fclose(fp_token_embed); + } else { + printf("Open %s fail!\n", filename); + return -1; + } + return 0; +} + + +int sentence_to_word(const char* sentence, char** word, int max_word_num_in_sentence, int max_word_len) +{ + int num_word = 0; + int c = 0; + for (int i = 0; i < max_word_num_in_sentence; i++) + { + memset(word[i], 0, max_word_len); + } + for (int i = 0; i < max_word_num_in_sentence; i++) + { + if (sentence[i] == ' ') + { + num_word++; + c = 0; + i++; + } + word[num_word][c] = sentence[i]; + c++; + } + return num_word; +} + + +int decoder_token_2_word(int* output_token, char* strings, Bpe_Tools* bpe_tools) +{ + char predict_word[MAX_WORD_LEN]; + for (int i = 1; i < MAX_WORD_LEN; i++) + { + memset(predict_word, 0x00, sizeof(predict_word)); + if (output_token[i] == 2 or output_token[i] <= 0) + { + break; + } + bpe_tools->get_word_by_token(output_token[i], predict_word); + for (int j = 0; j < MAX_WORD_LEN; j++) + { + if (predict_word[j] == '@' and predict_word[j + 1] == '@') + { + predict_word[j] = 0; + predict_word[j + 1] = 0; + break; + } + } + // printf("%s", predict_word); + strcat(strings, predict_word); + } + // printf("\n"); + // 
printf("===================================\n"); + return 0; +} + +// 1x4x16x64 -> 1x15x64x4 +int preprocess_prev_key_value(float *prev_data, float *save_data) +{ + float mid_data[MAX_SENTENCE_LEN * EMBEDDING_DIM]; + + // 1x4x16x64->1x16x64x4 + for (int s = 0; s < MAX_SENTENCE_LEN * EMBEDDING_DIM / HEAD_NUM; s++) + { + for (int h = 0; h < HEAD_NUM; h++) + { + mid_data[s * HEAD_NUM + h] = save_data[h * MAX_SENTENCE_LEN * EMBEDDING_DIM / HEAD_NUM + s]; + } + } + + // 1x16x64x4->1x15x64x4 + memcpy(prev_data, mid_data + EMBEDDING_DIM, (MAX_SENTENCE_LEN - 1) * EMBEDDING_DIM * sizeof(float)); + + return 0; +} + +// 1x4x16x64 -> 1x15x64x4 +int preprocess_prev_key_value_half(half *prev_data, half *save_data) +{ + half mid_data[MAX_SENTENCE_LEN * EMBEDDING_DIM]; + + // 1x4x16x64->1x16x64x4 + for (int s = 0; s < MAX_SENTENCE_LEN * EMBEDDING_DIM / HEAD_NUM; s++) + { + for (int h = 0; h < HEAD_NUM; h++) + { + mid_data[s * HEAD_NUM + h] = save_data[h * MAX_SENTENCE_LEN * EMBEDDING_DIM / HEAD_NUM + s]; + } + } + + // 1x16x64x4->1x15x64x4 + memcpy(prev_data, mid_data + EMBEDDING_DIM, (MAX_SENTENCE_LEN - 1) * EMBEDDING_DIM * sizeof(half)); + + half prev_mid_data[(MAX_SENTENCE_LEN - 1) * EMBEDDING_DIM]; + memcpy(prev_mid_data, prev_data, (MAX_SENTENCE_LEN - 1) * EMBEDDING_DIM * sizeof(half)); + + // 1x1x64x4 -> 1x4x15x64 + // zero-copy need layout [1,4,15,64] + for (int h = 0; h < HEAD_NUM; h++) { + for (int s = 0; s < (MAX_SENTENCE_LEN - 1) * EMBEDDING_DIM / HEAD_NUM; s++) { + prev_data[h * (MAX_SENTENCE_LEN - 1) * EMBEDDING_DIM / HEAD_NUM + s] = prev_mid_data[s * HEAD_NUM + h]; + } + } + + return 0; +} + +int dump_float(const float *array, int count, bool is_in, int index) +{ + // open file + FILE *file = nullptr; + + if (is_in) { + char name[64] = {0}; + sprintf(name, "in_%d.tensor", index); + file = fopen(name, "w"); + } else { + file = fopen("out.tensor", "w"); + } + + if (file == NULL) + { + perror("Error opening file"); + return -1; // open fail + } + + // write float data + for (int i = 0; i < count; ++i) + { + if (fprintf(file, "%f\n", array[i]) < 0) + { + perror("Error writing to file"); + fclose(file); + return -1; // write fail + } + } + + // close file + fclose(file); + + return 0; +} + +int rknn_nmt_process( + rknn_lite_transformer_context_t* app_ctx, + int* input_token, + int* output_token) +{ + int ret = 0; + + TIMER timer; + TIMER timer_total; + + // share max buffer + float enc_embedding[app_ctx->enc_len * EMBEDDING_DIM]; + float dec_embedding[app_ctx->dec_len * EMBEDDING_DIM]; + float enc_mask[app_ctx->enc_len]; + float dec_mask[app_ctx->dec_len]; + float dec_enc_mask[app_ctx->dec_len]; + int input_token_sorted[app_ctx->enc_len]; + memset(enc_embedding, 0x00, sizeof(enc_embedding)); + memset(dec_embedding, 0x00, sizeof(dec_embedding)); + memset(enc_mask, 0x00, sizeof(enc_mask)); + memset(dec_mask, 0x00, sizeof(dec_mask)); + memset(dec_enc_mask, 0x00, sizeof(app_ctx->dec_len)); + + // init prev key + float prev_key[DECODER_LAYER_NUM][(app_ctx->dec_len-1) * EMBEDDING_DIM]; + float prev_value[DECODER_LAYER_NUM][(app_ctx->dec_len-1) * EMBEDDING_DIM]; + memset(prev_key, 0x00, sizeof(prev_key)); + memset(prev_value, 0x00, sizeof(prev_value)); + + int input_token_give = 0; + for (int i=0; ienc_len; i++){ + if (input_token[i] <= 1){ + break; + } + input_token_give++; + } +#ifdef ENCODER_INPUT_TOKEN_RIGHTSIDE_ALIGN + // working as [22,33,1,1,1,1] -> [1,1,1,22,33,2] + memset(input_token_sorted, 0, app_ctx->enc_len*sizeof(int)); + input_token_sorted[app_ctx->enc_len-1] = 2; + for (int i=0; 
ienc_len-1 - input_token_give +i] = input_token[i]; + } +#else + // working as [22,33,1,1,1,1] -> [22,33,2,1,1,1] + memset(input_token_sorted, 0, app_ctx->enc_len * sizeof(int)); + for (int i = 0; i < input_token_give; i++) { + input_token_sorted[app_ctx->enc_len - 1 - input_token_give + i] = input_token[i]; + } + input_token_sorted[input_token_give] = 2; +#endif + + // gen encoder mask + printf("input tokens(all should > 0):\n"); + for (int i=0; i< app_ctx->enc_len; i++){ + if (input_token_sorted[i] == 0){ + input_token_sorted[i] = 1; + enc_mask[i] = 1; + } + else if(input_token_sorted[i] == 1){ + enc_mask[i] = 1; + } + else{ + enc_mask[i] = 0; + } + printf(" %d", input_token_sorted[i]); + } + printf("\n"); + + // expand_encoder_mask + float enc_mask_expand[app_ctx->enc_len * app_ctx->enc_len]; + memset(enc_mask_expand, 0x00, sizeof(enc_mask_expand)); + for (int i=0; ienc_len; i++){ + for (int j=0; jenc_len; j++){ + enc_mask_expand[i*app_ctx->enc_len+j] = enc_mask[j]; + } + } + + token_embeding(app_ctx->nmt_tokens.enc_token_embed, app_ctx->nmt_tokens.enc_pos_embed, input_token_sorted, app_ctx->enc_len, enc_embedding); +#if USE_NORMAL_API + app_ctx->enc.inputs[0].buf = enc_embedding; + app_ctx->enc.inputs[1].buf = enc_mask_expand; +#else + float_to_half_array(enc_embedding, (half *)(app_ctx->enc.in_mems[0]->logical_addr), app_ctx->enc.in_attrs[0].n_elems); + float_to_half_array(enc_mask_expand, (half *)(app_ctx->enc.in_mems[1]->logical_addr), app_ctx->enc.in_attrs[1].n_elems); +#endif + + // Run Encoder + timer.tik(); + +#if USE_NORMAL_API + ret = rknn_inputs_set(app_ctx->enc.ctx, app_ctx->enc.n_input, app_ctx->enc.inputs); + if (ret < 0) + { + printf("[enc]rknn_input_set fail! ret=%d\n", ret); + return -1; + } +#endif + + ret = rknn_run(app_ctx->enc.ctx, nullptr); + if (ret < 0){ printf("[enc] rknn_run fail! ret=%d\n", ret); return -1; } + +#if USE_NORMAL_API + ret = rknn_outputs_get(app_ctx->enc.ctx, app_ctx->enc.n_output, app_ctx->enc.outputs, nullptr); + if (ret < 0) + { + printf("[enc] rknn_outputs_get fail! ret=%d\n", ret); + return -1; + } +#endif + + timer.tok(); + timer.print_time("rknn encoder run"); + + +#if USE_NORMAL_API + // prev_key / prev_value use NHWC format. 
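+    // note: decoder inputs[0..3] are the token embedding slice, the encoder output,
+    // the encoder mask and the decoder mask; inputs[4..9] carry the per-layer
+    // key/value caches (reshaped to 1x15x64x4 by preprocess_prev_key_value above),
+    // which is why only indexes 4 and up are tagged NHWC below.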
+    for (int i = 4; i < app_ctx->dec.n_input; i++) {
+        app_ctx->dec.inputs[i].fmt = RKNN_TENSOR_NHWC;
+    }
+#else
+    // reset memory for inputs / outputs of decoder
+    for (int in_index = 0; in_index < app_ctx->dec.n_input; in_index++) {
+        memset(app_ctx->dec.in_mems[in_index]->logical_addr, 0, app_ctx->dec.in_attrs[in_index].size);
+    }
+    for (int out_index = 0; out_index < app_ctx->dec.n_output; out_index++) {
+        memset(app_ctx->dec.out_mems[out_index]->logical_addr, 0, app_ctx->dec.out_attrs[out_index].size);
+    }
+#endif
+
+    // Inputs that do not change as the decoder iterates
+#if USE_NORMAL_API
+    app_ctx->dec.inputs[1].buf = app_ctx->enc.outputs[0].buf;
+    app_ctx->dec.inputs[2].buf = enc_mask;
+#else
+    memcpy(app_ctx->dec.in_mems[1]->logical_addr, app_ctx->enc.out_mems[0]->logical_addr, app_ctx->enc.out_attrs[0].size);
+    memcpy(app_ctx->dec.in_mems[2]->logical_addr, app_ctx->enc.in_mems[1]->logical_addr, app_ctx->dec.in_attrs[2].size);
+#endif
+
+    for (int i = 0; i < app_ctx->dec_len; i++)
+    {
+        output_token[i] = 1;
+    }
+    output_token[0] = 2;
+
+    // decoder run
+    timer_total.tik();
+    for (int num_iter = 0; num_iter < app_ctx->dec_len; num_iter++){
+        token_embeding(app_ctx->nmt_tokens.dec_token_embed, app_ctx->nmt_tokens.dec_pos_embed, output_token, num_iter + 1, dec_embedding);
+#if USE_NORMAL_API
+        app_ctx->dec.inputs[0].buf = dec_embedding + num_iter * EMBEDDING_DIM;
+#else
+        float_to_half_array(dec_embedding + num_iter * EMBEDDING_DIM, (half *)(app_ctx->dec.in_mems[0]->logical_addr), app_ctx->dec.in_attrs[0].n_elems);
+#endif
+
+        float mask;
+        for (int j = 0; j < app_ctx->dec_len; j++) {
+            if (j >= app_ctx->dec_len - 1 - num_iter) {
+                mask = 0;
+            } else {
+                mask = 1;
+            }
+            dec_mask[j] = mask;
+        }
+#if USE_NORMAL_API
+        app_ctx->dec.inputs[3].buf = dec_mask;
+#else
+        float_to_half_array(dec_mask, (half *)(app_ctx->dec.in_mems[3]->logical_addr), app_ctx->dec.in_attrs[3].n_elems);
+#endif
+
+        if (num_iter != 0) {
+#if USE_NORMAL_API
+            // copy previous output to input
+            preprocess_prev_key_value(prev_key[0], (float *)app_ctx->dec.outputs[1].buf);
+            preprocess_prev_key_value(prev_value[0], (float *)app_ctx->dec.outputs[2].buf);
+            preprocess_prev_key_value(prev_key[1], (float *)app_ctx->dec.outputs[3].buf);
+            preprocess_prev_key_value(prev_value[1], (float *)app_ctx->dec.outputs[4].buf);
+            preprocess_prev_key_value(prev_key[2], (float *)app_ctx->dec.outputs[5].buf);
+            preprocess_prev_key_value(prev_value[2], (float *)app_ctx->dec.outputs[6].buf);
+
+            rknn_outputs_release(app_ctx->dec.ctx, app_ctx->dec.n_output, app_ctx->dec.outputs);
+#else
+            // previous key 0
+            preprocess_prev_key_value_half((half *)(app_ctx->dec.in_mems[4]->logical_addr), (half *)(app_ctx->dec.out_mems[1]->logical_addr));
+            // previous value 0
+            preprocess_prev_key_value_half((half *)(app_ctx->dec.in_mems[5]->logical_addr), (half *)(app_ctx->dec.out_mems[2]->logical_addr));
+            // previous key 1
+            preprocess_prev_key_value_half((half *)(app_ctx->dec.in_mems[6]->logical_addr), (half *)(app_ctx->dec.out_mems[3]->logical_addr));
+            // previous value 1
+            preprocess_prev_key_value_half((half *)(app_ctx->dec.in_mems[7]->logical_addr), (half *)(app_ctx->dec.out_mems[4]->logical_addr));
+            // previous key 2
+            preprocess_prev_key_value_half((half *)(app_ctx->dec.in_mems[8]->logical_addr), (half *)(app_ctx->dec.out_mems[5]->logical_addr));
+            // previous value 2
+            preprocess_prev_key_value_half((half *)(app_ctx->dec.in_mems[9]->logical_addr), (half *)(app_ctx->dec.out_mems[6]->logical_addr));
+#endif
+        }
+
+#if USE_NORMAL_API
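+        // feed the caches back in: on the first iteration these are still the
+        // zero-initialized buffers; afterwards they hold the shifted key/value
+        // tensors rebuilt from the previous step's outputs above.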
app_ctx->dec.inputs[4].buf = prev_key[0]; + app_ctx->dec.inputs[5].buf = prev_value[0]; + app_ctx->dec.inputs[6].buf = prev_key[1]; + app_ctx->dec.inputs[7].buf = prev_value[1]; + app_ctx->dec.inputs[8].buf = prev_key[2]; + app_ctx->dec.inputs[9].buf = prev_value[2]; + + // set inputs + ret = rknn_inputs_set(app_ctx->dec.ctx, app_ctx->dec.n_input, app_ctx->dec.inputs); + if (ret < 0) + { + printf("[decoder] rknn_inputs_set fail! ret=%d\n", ret); + return -1; + } +#endif + + // Run + timer.tik(); + ret = rknn_run(app_ctx->dec.ctx, nullptr); + timer.tok(); + if (ret < 0){ printf("rknn_run fail! ret=%d\n", ret); return -1; } + timer.print_time("rknn decoder run"); + +#if USE_NORMAL_API + // Get outputs + ret = rknn_outputs_get(app_ctx->dec.ctx, app_ctx->dec.n_output, app_ctx->dec.outputs, NULL); + if (ret < 0) + { + printf("rknn_decoder_outputs_get fail! ret=%d\n", ret); + return -1; + } +#endif + + // argmax + int max = 0; + float value; +#if USE_NORMAL_API + float* decoder_result_array = (float*)(app_ctx->dec.outputs[0].buf); + value = decoder_result_array[0]; + for (int index = 1; index < app_ctx->dec.out_attrs[0].dims[0]; index++){ + if (decoder_result_array[index] > value){ + value = decoder_result_array[index]; + max = index; + } + } +#else + half *decoder_result_array = (half *)(app_ctx->dec.out_mems[0]->logical_addr); + value = half_to_float(decoder_result_array[0]); + for (int index = 1; index < app_ctx->dec.out_attrs[0].dims[0]; index++) { + if (half_to_float(decoder_result_array[index]) > value){ + value = half_to_float(decoder_result_array[index]); + max = index; + } + } +#endif + //debug + // printf("argmax - index %d, value %f\n", max, value); + output_token[num_iter + 1] = max; + if (max == 2){ break;} + } + timer_total.tok(); + +#if USE_NORMAL_API + rknn_outputs_release(app_ctx->enc.ctx, app_ctx->enc.n_output, app_ctx->enc.outputs); + rknn_outputs_release(app_ctx->dec.ctx, app_ctx->dec.n_output, app_ctx->dec.outputs); +#endif + + // for debug + int output_len=0; + printf("decoder output token: "); + for (int i = 0; i < app_ctx->dec_len; i++){ + if (output_token[i] == 1){break;} + printf("%d ", output_token[i]); + output_len ++; + } + printf("\n"); + + timer.print_time("rknn decoder once run"); + printf("decoder run %d times. 
", output_len-1); + timer_total.print_time("cost"); + + return output_len; +} + + +int init_lite_transformer_model(const char* encoder_path, + const char* decoder_path, + const char* token_embed_path, + const char* pos_embed_path, + const char* bpe_dict_path, + const char* token_dict_path, + const char* common_word_path, + DICT_ORDER_TYPE dict_order_type, + rknn_lite_transformer_context_t* app_ctx) +{ + int ret = 0; + memset(app_ctx, 0x00, sizeof(rknn_lite_transformer_context_t)); +#if USE_NORMAL_API + app_ctx->enc.use_zp = false; + app_ctx->dec.use_zp = false; +#else + app_ctx->enc.use_zp = true; + app_ctx->dec.use_zp = true; + printf("!!!!!!!!!!!!!!!!!!!!\n"); + printf("If you want to use the zero-copy API, please ensure that the RKNN model used is a precompiled model.\n"); + printf("You can refer to this example to precompile the model: \n"); + printf(" https://github.com/rockchip-linux/rknn-toolkit/tree/master/examples/common_function_demos/export_rknn_precompile_model\n"); + printf("!!!!!!!!!!!!!!!!!!!!\n"); +#endif + + // Init encoder and decoder + printf("--> init rknn encoder %s\n", encoder_path); + ret = rkdemo_model_init(true, encoder_path, &(app_ctx->enc)); + if (ret < 0) + { + printf("init encoder failed!\n"); + return ret; + } + + printf("--> init rknn decoder %s\n", decoder_path); + ret = rkdemo_model_init(false, decoder_path, &(app_ctx->dec)); + if (ret < 0) + { + printf("init decoder failed.\n"); + return ret; + } + + // set pre-process / post-process configure + app_ctx->enc_len = app_ctx->enc.in_attrs[0].dims[1]; + app_ctx->dec_len = app_ctx->dec.in_attrs[1].dims[1]; + + // init dict and bpe + int nmt_word_dict_len = app_ctx->dec.out_attrs[0].n_elems/ app_ctx->dec.out_attrs[0].dims[2]; + app_ctx->nmt_tokens.enc_token_embed = (float*)malloc(nmt_word_dict_len* EMBEDDING_DIM * sizeof(float)); + app_ctx->nmt_tokens.enc_pos_embed = (float*)malloc(POS_LEN* EMBEDDING_DIM * sizeof(float)); + printf("--> load token embed: %s\n", token_embed_path); + ret = load_bin_fp32(token_embed_path, app_ctx->nmt_tokens.enc_token_embed, nmt_word_dict_len* EMBEDDING_DIM); + if (ret != 0){ return -1;} + printf("--> load pos embed: %s\n", pos_embed_path); + ret = load_bin_fp32(pos_embed_path, app_ctx->nmt_tokens.enc_pos_embed, POS_LEN* EMBEDDING_DIM); + if (ret != 0){ return -1;} + app_ctx->nmt_tokens.dec_token_embed = app_ctx->nmt_tokens.enc_token_embed; + app_ctx->nmt_tokens.dec_pos_embed = app_ctx->nmt_tokens.enc_pos_embed; + + + app_ctx->bpe_tools = new Bpe_Tools(); + printf("--> load bpe_dict: %s\n", bpe_dict_path); + ret = app_ctx->bpe_tools->prepare_bpe_data(bpe_dict_path, dict_order_type); + if (ret != 0){ return -1;} + printf("--> load word dict: %s\n", token_dict_path); + ret = app_ctx->bpe_tools->prepare_token_data(token_dict_path, dict_order_type); + if (ret != 0){ return -1;} + app_ctx->bpe_tools->set_token_offset(4); + + if (common_word_path != nullptr){ + printf("--> load common word dict: %s\n", common_word_path); + ret = app_ctx->bpe_tools->prepare_common_word_data(common_word_path, CW_TOKEN); + if (ret != 0){ return -1;} + } + + return 0; +} + + +int release_lite_transformer_model(rknn_lite_transformer_context_t* app_ctx) +{ + // Release + rkdemo_model_release(&app_ctx->enc); + rkdemo_model_release(&app_ctx->dec); + free(app_ctx->nmt_tokens.enc_token_embed); + free(app_ctx->nmt_tokens.enc_pos_embed); + delete app_ctx->bpe_tools; + return 0; +} + + +int inference_lite_transformer_model(rknn_lite_transformer_context_t* app_ctx, + const char* input_sentence, + char* output_sentence) 
+{ + TIMER timer; + char* input_word[MAX_WORD_NUM_IN_SENTENCE]; + char* output_word[MAX_WORD_NUM_IN_SENTENCE]; + for (int i = 0; i < MAX_WORD_NUM_IN_SENTENCE; i++) + { + input_word[i] = (char*)malloc(MAX_WORD_LEN); + output_word[i] = (char*)malloc(MAX_WORD_LEN); + } + timer.tik(); + int num_word = sentence_to_word(input_sentence, input_word, MAX_WORD_NUM_IN_SENTENCE, MAX_WORD_LEN); + + int token_list[100]; + int token_list_len=0; + memset(token_list, 0, sizeof(token_list)); + for (int i = 0; i <= num_word; i++) + { + int word_tokens[WORD_LEN_LIMIT]; + int _tk_len = 0; + _tk_len = app_ctx->bpe_tools->bpe_and_tokenize(input_word[i], word_tokens); + for (int j = 0; j < _tk_len; j++) + { + token_list[token_list_len] = word_tokens[j]; + token_list_len++; + } + } + timer.tok(); + timer.print_time("bpe preprocess"); + + int max_input_len = app_ctx->enc_len; + if (token_list_len > max_input_len) + { + printf("\nWARNING: token_len(%d) > max_input_len(%d), only keep %d tokens!\n", token_list_len, max_input_len, max_input_len); + printf("Tokens all :"); + for (int i = 0; i < token_list_len; i++){printf(" %d", token_list[i]);} + printf("\n"); + token_list_len = max_input_len; + printf("Tokens remains :"); + for (int i = 0; i < token_list_len; i++){printf(" %d", token_list[i]);} + printf("\n"); + } + + int output_token[max_input_len]; + memset(output_token, 0, sizeof(output_token)); + int output_len = 0; + output_len = rknn_nmt_process(app_ctx, token_list, output_token); + if (output_len < 0) { + printf("rknn_nmt_process fail, please check log.\n"); + return -1; + } + + memset(output_sentence, 0, MAX_USER_INPUT_LEN); + int ret = 0; + ret = decoder_token_2_word(output_token, output_sentence, app_ctx->bpe_tools); + + for (int i = 0; i < MAX_WORD_NUM_IN_SENTENCE; i++){ + free(input_word[i]); + free(output_word[i]); + } + return 0; +} diff --git a/examples/lite_transformer/cpp/rknpu1/rkdemo_utils/rknn_demo_utils.cc b/examples/lite_transformer/cpp/rknpu1/rkdemo_utils/rknn_demo_utils.cc new file mode 100644 index 0000000..ca9b169 --- /dev/null +++ b/examples/lite_transformer/cpp/rknpu1/rkdemo_utils/rknn_demo_utils.cc @@ -0,0 +1,309 @@ +#include +#include +#include +#include +#include + +// multi thread support +#include +#include +#include + +#include "rknn_demo_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +static unsigned char *load_data(FILE *fp, size_t ofst, size_t sz) +{ + unsigned char *data; + int ret; + + data = NULL; + + if (NULL == fp) + { + return NULL; + } + + ret = fseek(fp, ofst, SEEK_SET); + if (ret != 0) + { + printf("blob seek failure.\n"); + return NULL; + } + + data = (unsigned char *)malloc(sz); + if (data == NULL) + { + printf("buffer malloc failure.\n"); + return NULL; + } + ret = fread(data, 1, sz, fp); + return data; +} + +static unsigned char *load_model(const char *filename, int *model_size) +{ + + FILE *fp; + unsigned char *data; + + fp = fopen(filename, "rb"); + if (NULL == fp) + { + printf("Open file %s failed.\n", filename); + return NULL; + } + + fseek(fp, 0, SEEK_END); + int size = ftell(fp); + + data = load_data(fp, 0, size); + + fclose(fp); + + *model_size = size; 
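+    // note: the caller owns the returned buffer; rkdemo_model_init() below
+    // frees it once rknn_init() has consumed the model data.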
+ return data; +} + +int set_io_attrs(const char *model_name, MODEL_INFO *model_info) +{ + int ret = 0; + + // Query input/output num + rknn_input_output_num io_num; + ret = rknn_query(model_info->ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(rknn_input_output_num)); + if (ret != RKNN_SUCC) + { + printf("[%s]: query io_num failed, ret=%d!\n", model_name, ret); + return -1; + } + printf("[%s]: in_num=%d, out_num=%d.\n", model_name, io_num.n_input, io_num.n_output); + model_info->n_input = io_num.n_input; + model_info->n_output = io_num.n_output; + + // Query input tensors attribute + model_info->in_attrs = (rknn_tensor_attr *)malloc(sizeof(rknn_tensor_attr) * model_info->n_input); + memset(model_info->in_attrs, 0, sizeof(rknn_tensor_attr) * model_info->n_input); + printf("[%s]: input attributes:\n", model_name); + for (int i = 0; i < io_num.n_input; i++) + { + model_info->in_attrs[i].index = i; + ret = rknn_query(model_info->ctx, RKNN_QUERY_INPUT_ATTR, (model_info->in_attrs) + i, sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("[%s]: query input attr failed, ret=%d!\n", model_name, ret); + return -1; + } + dump_tensor_attr((model_info->in_attrs) + i); + } + + // Query output tensors attribute + model_info->out_attrs = (rknn_tensor_attr *)malloc(sizeof(rknn_tensor_attr) * model_info->n_output); + memset(model_info->out_attrs, 0, sizeof(rknn_tensor_attr) * model_info->n_output); + printf("[%s]: output attributes:\n", model_name); + for (int i = 0; i < io_num.n_output; i++) + { + model_info->out_attrs[i].index = i; + ret = rknn_query(model_info->ctx, RKNN_QUERY_OUTPUT_ATTR, (model_info->out_attrs) + i, sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("[%s] query output attr failed, ret=%d!\n", model_name, ret); + return -1; + } + dump_tensor_attr((model_info->out_attrs) + i); + } + + return 0; +} + +int init_input_buffer_all(const char *model_name, MODEL_INFO *model_info) +{ + int ret = 0; + + rknn_tensor_attr *in_attrs = model_info->in_attrs; + + // create memory for inputs + if (model_info->use_zp) + { + // zero-copy + printf("[%s] use zero-copy api, please make sure the npu driver version >= 1.7.3\n", model_name); + model_info->in_mems = (rknn_tensor_mem **)malloc(sizeof(rknn_tensor_mem *) * model_info->n_input); + for (int i = 0; i < model_info->n_input; i++) + { + model_info->in_mems[i] = rknn_create_mem(model_info->ctx, in_attrs[i].size); + printf("[%s] init_input_buffer(zero_copy): node_index=%d, size: %d.\n", model_name, i, in_attrs[i].size); + ret = rknn_set_io_mem(model_info->ctx, model_info->in_mems[i], &(in_attrs[i])); + if (ret < 0) + { + printf("[%s] set_io_mem for input[%d] failed, ret=%d.\n", model_name, i, ret); + return -1; + } + } + } + else + { + // normal api + model_info->inputs = (rknn_input *)malloc(sizeof(rknn_input) * model_info->n_input); + memset(model_info->inputs, 0x0, sizeof(rknn_input) * model_info->n_input); + for (int i = 0; i < model_info->n_input; i++) + { + model_info->inputs[i].index = i; + model_info->inputs[i].type = RKNN_TENSOR_FLOAT32; + // input.fmt = RKNN_TENSOR_NHWC; + model_info->inputs[i].size = in_attrs[i].n_elems * sizeof(float); + // malloc buffer during inference. 
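+            // (the actual .buf pointers are assigned per inference by the caller,
+            // e.g. rknn_nmt_process() fills them in right before rknn_inputs_set)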
+ printf("[%s] init_input_buffer(normal_api): node_index=%d, size: %d.\n", model_name, i, model_info->inputs[i].size); + } + } + + return 0; +} + +int init_output_buffer_all(const char *model_name, MODEL_INFO *model_info) +{ + int ret = 0; + + rknn_tensor_attr *out_attrs = model_info->out_attrs; + + // create memory for inputs + if (model_info->use_zp) + { + // zero-copy + printf("[%s] use zero-copy api, please make sure the npu driver version >= 1.7.3\n", model_name); + model_info->out_mems = (rknn_tensor_mem **)malloc(sizeof(rknn_tensor_mem *) * model_info->n_output); + for (int i = 0; i < model_info->n_output; i++) + { + model_info->out_mems[i] = rknn_create_mem(model_info->ctx, out_attrs[i].size); + printf("[%s] init_output_buffer(zero_copy): node_index=%d, size: %d.\n", model_name, i, out_attrs[i].size); + ret = rknn_set_io_mem(model_info->ctx, model_info->out_mems[i], &(out_attrs[i])); + if (ret < 0) + { + printf("[%s] set_io_mem for input[%d] failed, ret=%d.\n", model_name, i, ret); + return -1; + } + } + } + else + { + // normal api + model_info->outputs = (rknn_output *)malloc(sizeof(rknn_output) * model_info->n_output); + memset(model_info->outputs, 0x0, sizeof(rknn_output) * model_info->n_output); + for (int i = 0; i < model_info->n_output; i++) + { + model_info->outputs[i].want_float = 1; + printf("[%s] init_output_buffer(normal_api): node_index=%d, want_float: true.\n", model_name, i); + } + } + + return 0; +} + +int rkdemo_model_init(bool is_encoder, const char *model_path, MODEL_INFO *model_info) +{ + int ret = 0; + int model_data_size = 0; + const char *model_name; + + if (model_path == nullptr) + { + printf("ERROR model path is null"); + return -1; + } + + // define model name used to debug + if (is_encoder) + { + model_name = "encoder"; + } + else + { + model_name = "decoder"; + } + + // load model data + unsigned char *model_data = load_model(model_path, &model_data_size); + + // init rknn context + ret = rknn_init(&(model_info->ctx), (void *)model_data, model_data_size, 0); + free(model_data); + if (ret < 0) + { + printf("[%s] init RKNN model failed! 
ret=%d\n", model_name, ret); + return -1; + } + + // set input/output attributes + ret = set_io_attrs(model_name, model_info); + if (ret < 0) + { + return -1; + } + + // init inputs/outputs buffer + ret = init_input_buffer_all(model_name, model_info); + if (ret < 0) + { + printf("[%s] init input buffer failed!", model_name); + return -1; + } + ret = init_output_buffer_all(model_name, model_info); + if (ret < 0) + { + printf("[%s] init output buffer failed!", model_name); + return -1; + } + + return ret; +} + +int rkdemo_model_release(MODEL_INFO *model_info) +{ + // release inputs/outputs buffer + if (model_info->use_zp) + { + for (int i = 0; i < model_info->n_input; i++) + { + rknn_destroy_mem(model_info->ctx, model_info->in_mems[i]); + } + free(model_info->in_mems); + + for (int i = 0; i < model_info->n_output; i++) + { + rknn_destroy_mem(model_info->ctx, model_info->out_mems[i]); + } + free(model_info->out_mems); + } + + // release inputs/outputs for normal api + if (model_info->inputs) + { + free(model_info->inputs); + } + if (model_info->outputs) + { + free(model_info->outputs); + } + + // release inputs/output attributes + if (model_info->in_attrs) + { + free(model_info->in_attrs); + } + if (model_info->out_attrs) + { + free(model_info->out_attrs); + } + + // destroy rknn_context + rknn_destroy(model_info->ctx); + + return 0; +} \ No newline at end of file diff --git a/examples/lite_transformer/cpp/rknpu1/rkdemo_utils/rknn_demo_utils.h b/examples/lite_transformer/cpp/rknpu1/rkdemo_utils/rknn_demo_utils.h new file mode 100644 index 0000000..ef3b28c --- /dev/null +++ b/examples/lite_transformer/cpp/rknpu1/rkdemo_utils/rknn_demo_utils.h @@ -0,0 +1,31 @@ +#ifndef _RKNN_DEMO_UTILS_H +#define _RKNN_DEMO_UTILS_H + +#include +#include +#include +#include +#include + +#include "rknn_api.h" + +typedef struct _MODEL_INFO +{ + rknn_context ctx; // rknn context + bool use_zp; // whether use zero copy api, default is true + + uint32_t n_input; // input number + rknn_tensor_attr *in_attrs = nullptr; // input tensors` attribute + rknn_input *inputs = nullptr; // rknn inputs, used for normal api + rknn_tensor_mem **in_mems = nullptr; // inputs` memory, used for zero-copy api + + uint32_t n_output; // output number + rknn_tensor_attr *out_attrs = nullptr; // output tensors` attribute + rknn_output *outputs = nullptr; // rknn outputs, used for normal api + rknn_tensor_mem **out_mems = nullptr; // outputs` memory, used for zero-copy api +} MODEL_INFO; + +int rkdemo_model_init(bool is_encoder, const char *model_path, MODEL_INFO *model_info); +int rkdemo_model_release(MODEL_INFO *model_info); + +#endif \ No newline at end of file diff --git a/examples/lite_transformer/python/convert.py b/examples/lite_transformer/python/convert.py index a0fafc0..79eb7fe 100644 --- a/examples/lite_transformer/python/convert.py +++ b/examples/lite_transformer/python/convert.py @@ -1,16 +1,14 @@ import sys -import os - from rknn.api import RKNN DEFAULT_QUANT = False - def parse_arg(): if len(sys.argv) < 3: print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -19,14 +17,9 @@ def parse_arg(): do_quant = 
DEFAULT_QUANT
    if len(sys.argv) > 3:
        model_type = sys.argv[3]
-        if model_type not in ['i8', 'fp']:
+        if model_type not in ['fp']:
             print("ERROR: Invalid model type: {}".format(model_type))
             exit(1)
-        elif model_type == 'i8':
-            do_quant = True
-            assert False, "i8 quantization is not supported yet"
-        else:
-            do_quant = False
 
     if len(sys.argv) > 4:
         output_path = sys.argv[4]
diff --git a/examples/mobilenet/README.md b/examples/mobilenet/README.md
index 093a3e7..eb9a81a 100644
--- a/examples/mobilenet/README.md
+++ b/examples/mobilenet/README.md
@@ -20,14 +20,12 @@ cd model
 ```
 
-
 ## Script Usage
 
 *Usage:*
 
 ```shell
 cd python
-python mobilenet.py --model <onnx_model> --target <TARGET_PLATFORM>
-
+python mobilenet.py --model <onnx_model> --target <TARGET_PLATFORM> --output_path <output_rknn_path> --dtype <dtype>
 # such as: 
 python mobilenet.py --model ../model/mobilenetv2-12.onnx --target rk3588
 # output model will be saved as ../model/mobilenetv2-12.rknn
 ```
 
 *Description:*
 
 - <onnx_model> should be the ONNX model path.
-- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588, RV1103, RV1106 according to board SOC version.
-
-
+- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588, RV1103, RV1106, RK1808, RV1109, RV1126 according to board SOC version. Case insensitive.
+- <output_rknn_path> export path of the RKNN model. **Optional, default is `mobilenet_v2.rknn`.**
+- <dtype> quantized data type. **Optional, default is `i8`.** `i8`/`u8` means do quantization, `fp32` means no quantization.
 
 ## Android Demo
 
+**Note: RK1808, RV1109, RV1126 do not support Android.**
+
 ### Compiling && Building
 
 ```sh
@@ -103,8 +103,12 @@ cd ../../
 # such as 
 ./build-linux.sh -t rk3588 -a aarch64 -d mobilenet
-# such as
+# such as 
 ./build-linux.sh -t rv1106 -a armhf -d mobilenet
+# such as 
+./build-linux.sh -t rk1808 -a aarch64 -d mobilenet
+# such as 
+./build-linux.sh -t rv1126 -a armhf -d mobilenet
 ```
 
 - <GCC_COMPILER_PATH>: Specified as GCC_COMPILER path.
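
For reference, the conversion flow that the scripts above wrap is small enough to sketch directly. The snippet below is a minimal illustration, not the demo's own script: it assumes a local `mobilenetv2-12.onnx` and a `dataset.txt` quantization list, and shows the one RKNPU1-specific detail worth remembering, namely that `input_size_list` omits the batch dimension on RK1808/RV1109/RV1126 while RKNPU2 platforms keep it.

```python
from rknn.api import RKNN

RKNPU1_TARGET = ['rk1808', 'rv1109', 'rv1126']
target = 'rk1808'  # assumption: converting for an RKNPU1 board

rknn = RKNN(verbose=False)
rknn.config(mean_values=[[255*0.485, 255*0.456, 255*0.406]],
            std_values=[[255*0.229, 255*0.224, 255*0.225]],
            target_platform=target)

# RKNPU1 toolkits expect [C, H, W]; RKNPU2 toolkits expect [N, C, H, W]
size = [[3, 224, 224]] if target in RKNPU1_TARGET else [[1, 3, 224, 224]]
ret = rknn.load_onnx(model='mobilenetv2-12.onnx', inputs=['input'], input_size_list=size)
assert ret == 0, 'load_onnx failed'

# 'u8' is the quantized dtype on RKNPU1, 'i8' on RKNPU2; both mean do_quantization=True
ret = rknn.build(do_quantization=True, dataset='./dataset.txt')
assert ret == 0, 'build failed'

rknn.export_rknn('mobilenet_v2.rknn')
rknn.release()
```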
diff --git a/examples/mobilenet/cpp/CMakeLists.txt b/examples/mobilenet/cpp/CMakeLists.txt index 57f12e3..7fe61fc 100644 --- a/examples/mobilenet/cpp/CMakeLists.txt +++ b/examples/mobilenet/cpp/CMakeLists.txt @@ -9,13 +9,16 @@ if (ENABLE_ASAN) set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") endif () -set(rknpu2_mobilenet_file rknpu2/mobilenet.cc) +set(mobilenet_file rknpu2/mobilenet.cc) if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") - set(rknpu2_mobilenet_file rknpu2/mobilenet_rv1106_1103.cc) + set(mobilenet_file rknpu2/mobilenet_rv1106_1103.cc) add_definitions(-DRV1106_1103) #dma include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) endif() +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(mobilenet_file rknpu1/mobilenet.cc) +endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) @@ -26,14 +29,14 @@ file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc - ${rknpu2_mobilenet_file} + ${mobilenet_file} ) target_link_libraries(${PROJECT_NAME} fileutils imageutils - imagedrawing ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/examples/mobilenet/cpp/main.cc b/examples/mobilenet/cpp/main.cc index 22804ac..0671aec 100644 --- a/examples/mobilenet/cpp/main.cc +++ b/examples/mobilenet/cpp/main.cc @@ -76,6 +76,8 @@ int main(int argc, char** argv) dma_sync_cpu_to_device(rknn_app_ctx.img_dma_buf.dma_buf_fd); free(src_image.virt_addr); src_image.virt_addr = (unsigned char *)rknn_app_ctx.img_dma_buf.dma_buf_virt_addr; + src_image.fd = rknn_app_ctx.img_dma_buf.dma_buf_fd; + rknn_app_ctx.img_dma_buf.size = src_image.size; #endif int topk = 5; diff --git a/examples/mobilenet/cpp/rknpu1/mobilenet.cc b/examples/mobilenet/cpp/rknpu1/mobilenet.cc new file mode 100644 index 0000000..f3c1be8 --- /dev/null +++ b/examples/mobilenet/cpp/rknpu1/mobilenet.cc @@ -0,0 +1,276 @@ +#include +#include +#include +#include + +#include "mobilenet.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr* attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +typedef struct { + float value; + int index; +} element_t; + +static void swap(element_t* a, element_t* b) { + element_t temp = *a; + *a = *b; + *b = temp; +} + +static int partition(element_t arr[], int low, int high) { + float pivot = arr[high].value; + int i = low - 1; + + for (int j = low; j <= high - 1; j++) { + if (arr[j].value >= pivot) { + i++; + swap(&arr[i], &arr[j]); + } + } + + swap(&arr[i + 1], &arr[high]); + return (i + 1); +} + +static void quick_sort(element_t arr[], int low, int high) { + if (low < high) { + int pi = partition(arr, low, high); + quick_sort(arr, low, pi - 1); + quick_sort(arr, pi + 1, high); + } +} + +static void softmax(float* array, int size) { + // Find the maximum value in the array + float max_val = array[0]; + for (int i = 1; i < size; i++) { + if (array[i] > max_val) { + max_val = array[i]; + } + } + + // 
Subtract the maximum value from each element to avoid overflow + for (int i = 0; i < size; i++) { + array[i] -= max_val; + } + + // Compute the exponentials and sum + float sum = 0.0; + for (int i = 0; i < size; i++) { + array[i] = expf(array[i]); + sum += array[i]; + } + + // Normalize the array by dividing each element by the sum + for (int i = 0; i < size; i++) { + array[i] /= sum; + } +} + +static void get_topk_with_indices(float arr[], int size, int k, mobilenet_result* result) { + + // Create an array of elements, saving values ​​and index numbers + element_t* elements = (element_t*)malloc(size * sizeof(element_t)); + for (int i = 0; i < size; i++) { + elements[i].value = arr[i]; + elements[i].index = i; + } + + // Quick sort an array of elements + quick_sort(elements, 0, size - 1); + + // Get the top K maximum values ​​and their index numbers + for (int i = 0; i < k; i++) { + result[i].score = elements[i].value; + result[i].cls = elements[i].index; + } + + free(elements); +} + +int init_mobilenet_model(const char* model_path, rknn_app_context_t* app_ctx) +{ + int ret; + int model_len = 0; + char* model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr*)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr*)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + // Note: dims are arranged in reverse order for RKNPU1 + // if fmt is NCHW, dims = [W, H, C, N] + // if fmt is NHWC, dims = [C, W, H, N] + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_mobilenet_model(rknn_app_context_t* app_ctx) +{ + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_mobilenet_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, mobilenet_result* out_result, int topk) +{ + int ret; + image_buffer_t img; + rknn_input inputs[1]; + rknn_output outputs[1]; + + memset(&img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + img.width = app_ctx->model_width; + img.height = app_ctx->model_height; + img.format = IMAGE_FORMAT_RGB888; + img.size = get_image_size(&img); + img.virt_addr = (unsigned char*)malloc(img.size); + if (img.virt_addr == NULL) { + printf("malloc buffer size:%d fail!\n", img.size); + return -1; + } + + ret = convert_image(src_img, &img, NULL, NULL, 0); + if (ret < 0) { + printf("convert_image fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + // Post Process + softmax((float*)outputs[0].buf, app_ctx->output_attrs[0].n_elems); + + get_topk_with_indices((float*)outputs[0].buf, app_ctx->output_attrs[0].n_elems, topk, out_result); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + +out: + if (img.virt_addr != NULL) { + free(img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/mobilenet/cpp/rknpu2/mobilenet.cc b/examples/mobilenet/cpp/rknpu2/mobilenet.cc index 0309ab5..95674d5 100644 --- a/examples/mobilenet/cpp/rknpu2/mobilenet.cc +++ b/examples/mobilenet/cpp/rknpu2/mobilenet.cc @@ -22,13 +22,13 @@ typedef struct { int index; } element_t; -void swap(element_t* a, element_t* b) { +static void swap(element_t* a, element_t* b) { element_t temp = *a; *a = *b; *b = temp; } -int partition(element_t arr[], int low, int high) { +static int partition(element_t arr[], int low, int high) { float pivot = arr[high].value; int i = low - 1; @@ -43,7 +43,7 @@ int partition(element_t arr[], int low, int high) { return (i + 1); } -void quick_sort(element_t arr[], int low, int high) { +static void quick_sort(element_t arr[], int low, int high) { if (low < high) { int pi = partition(arr, low, high); quick_sort(arr, low, pi - 1); @@ -51,7 +51,7 @@ void quick_sort(element_t arr[], int low, int high) { } } -void softmax(float* array, int size) { +static void softmax(float* array, int size) { // Find the maximum value in the array float max_val = array[0]; for (int i = 1; i < size; i++) { @@ -78,19 +78,19 @@ void softmax(float* array, int size) { } } -void get_topk_with_indices(float arr[], int size, int k, mobilenet_result* result) { +static void get_topk_with_indices(float arr[], int size, int k, mobilenet_result* result) { - // 创建元素数组,保存值和索引号 + // Create an array of elements, saving values ​​and index numbers element_t* elements = (element_t*)malloc(size * sizeof(element_t)); for (int i = 0; i < size; i++) { elements[i].value = arr[i]; elements[i].index = i; } - // 对元素数组进行快速排序 + // Quick sort an array of elements quick_sort(elements, 0, size - 1); - // 获取前K个最大值和它们的索引号 + // Get the top K maximum values ​​and their index numbers for (int i = 0; i < k; i++) { result[i].score = elements[i].value; result[i].cls = elements[i].index; @@ -184,10 +184,6 @@ int init_mobilenet_model(const char* model_path, rknn_app_context_t* app_ctx) int release_mobilenet_model(rknn_app_context_t* app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -196,6 +192,10 @@ int release_mobilenet_model(rknn_app_context_t* app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/mobilenet/cpp/rknpu2/mobilenet_rv1106_1103.cc b/examples/mobilenet/cpp/rknpu2/mobilenet_rv1106_1103.cc index 8991aab..6d397b3 100644 --- a/examples/mobilenet/cpp/rknpu2/mobilenet_rv1106_1103.cc +++ b/examples/mobilenet/cpp/rknpu2/mobilenet_rv1106_1103.cc @@ -21,13 +21,13 @@ typedef struct { int index; } element_t; -void swap(element_t *a, element_t *b) { +static void swap(element_t *a, element_t *b) { element_t temp = *a; *a = *b; *b = temp; } -int partition(element_t arr[], int low, int high) { +static int partition(element_t arr[], int low, int high) { float pivot = arr[high].value; int i = low - 1; @@ -42,7 
+42,7 @@ int partition(element_t arr[], int low, int high) { return (i + 1); } -void quick_sort(element_t arr[], int low, int high) { +static void quick_sort(element_t arr[], int low, int high) { if (low < high) { int pi = partition(arr, low, high); quick_sort(arr, low, pi - 1); @@ -50,7 +50,7 @@ void quick_sort(element_t arr[], int low, int high) { } } -void softmax(float *array, int size) { +static void softmax(float *array, int size) { // Find the maximum value in the array float max_val = array[0]; for (int i = 1; i < size; i++) { @@ -77,19 +77,19 @@ void softmax(float *array, int size) { } } -void get_topk_with_indices(float arr[], int size, int k, mobilenet_result *result) { +static void get_topk_with_indices(float arr[], int size, int k, mobilenet_result *result) { - // 创建元素数组,保存值和索引号 + // Create an array of elements, saving values ​​and index numbers element_t *elements = (element_t *)malloc(size * sizeof(element_t)); for (int i = 0; i < size; i++) { elements[i].value = arr[i]; elements[i].index = i; } - // 对元素数组进行快速排序 + // Quick sort an array of elements quick_sort(elements, 0, size - 1); - // 获取前K个最大值和它们的索引号 + // Get the top K maximum values ​​and their index numbers for (int i = 0; i < k; i++) { result[i].score = elements[i].value; result[i].cls = elements[i].index; @@ -98,9 +98,10 @@ void get_topk_with_indices(float arr[], int size, int k, mobilenet_result *resul free(elements); } -// 量化模型的npu输出结果为int8数据类型,后处理要按照int8数据类型处理 -// 如下提供了int8排布的NC1HWC2转换成float的nchw转换代码 -int NC1HWC2_int8_to_NCHW_float(const int8_t *src, float *dst, int *dims, int channel, int h, int w, int zp, float scale) { +// The npu output result of the quantization model is of int8 data type, +// and the post-processing must be processed according to the int8 data type. +// The following provides the nchw conversion code for converting NC1HWC2 arranged in int8 into float. 
+static int NC1HWC2_int8_to_NCHW_float(const int8_t *src, float *dst, int *dims, int channel, int h, int w, int zp, float scale) { int batch = dims[0]; int C1 = dims[1]; int C2 = dims[4]; @@ -224,10 +225,6 @@ int init_mobilenet_model(const char *model_path, rknn_app_context_t *app_ctx) { } int release_mobilenet_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -239,15 +236,17 @@ int release_mobilenet_model(rknn_app_context_t *app_ctx) { for (int i = 0; i < app_ctx->io_num.n_input; i++) { if (app_ctx->input_mems[i] != NULL) { rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); - free(app_ctx->input_mems[i]); } } for (int i = 0; i < app_ctx->io_num.n_output; i++) { if (app_ctx->output_mems[i] != NULL) { rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); - free(app_ctx->output_mems[i]); } } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } @@ -261,8 +260,8 @@ int inference_mobilenet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_i img.height = app_ctx->model_height; img.format = IMAGE_FORMAT_RGB888; img.size = get_image_size(&img); - img.virt_addr = (unsigned char *)app_ctx->input_mems[0]->virt_addr; - if (img.virt_addr == NULL) { + img.fd = app_ctx->input_mems[0]->fd; + if (img.virt_addr == NULL && img.fd == 0) { printf("malloc buffer size:%d fail!\n", img.size); return -1; } diff --git a/examples/mobilenet/python/mobilenet.py b/examples/mobilenet/python/mobilenet.py index 47c18b7..3568d05 100644 --- a/examples/mobilenet/python/mobilenet.py +++ b/examples/mobilenet/python/mobilenet.py @@ -16,6 +16,9 @@ OUT_RKNN_PATH = MODEL_DIR + 'mobilenet_v2.rknn' CLASS_LABEL_PATH = MODEL_DIR + 'synset.txt' +RKNPU1_TARGET = ['rk1808', 'rv1109', 'rv1126'] + + def readable_speed(speed): speed_bytes = float(speed) speed_kbytes = speed_bytes / 1024 @@ -29,6 +32,7 @@ def readable_speed(speed): else: return "{:.2f} KB/s".format(speed_kbytes) + def show_progress(blocknum, blocksize, totalsize): speed = (blocknum * blocksize) / (time.time() - start_time) speed_str = " Speed: {}".format(readable_speed(speed)) @@ -43,6 +47,7 @@ def show_progress(blocknum, blocksize, totalsize): f.flush() f.write('\r\n') + def check_and_download_origin_model(): global start_time if not os.path.exists(MODEL_PATH): @@ -58,34 +63,47 @@ def check_and_download_origin_model(): exit(-1) print('done') + if __name__ == '__main__': - parser = argparse.ArgumentParser(description='MobileNet Python Demo', add_help=True) - parser.add_argument('--target', type=str, default='rk3566', help='RKNPU target platform') - parser.add_argument('--npu_device_test', action='store_true', default=False, help='Connected npu device run') - parser.add_argument('--accuracy_analysis', action='store_true', default=False, help='Accuracy analysis') - parser.add_argument('--eval_perf', action='store_true', default=False, help='Time consuming evaluation') - parser.add_argument('--eval_memory', action='store_true', default=False, help='Memory evaluation') - parser.add_argument('--model', type=str, default=MODEL_PATH, help='onnx model path') - parser.add_argument('--output_path', type=str, default=OUT_RKNN_PATH, help='output rknn model path') - parser.add_argument('--dtype', type=str, default='i8', help='dtype of model, i8/fp32') + parser = argparse.ArgumentParser( + description='MobileNet Python Demo', 
add_help=True)
+    parser.add_argument('--target', type=str,
+                        default='rk3566', help='RKNPU target platform')
+    parser.add_argument('--npu_device_test', action='store_true',
+                        default=False, help='Connected npu device run')
+    parser.add_argument('--accuracy_analysis', action='store_true',
+                        default=False, help='Accuracy analysis')
+    parser.add_argument('--eval_perf', action='store_true',
+                        default=False, help='Time-consuming evaluation')
+    parser.add_argument('--eval_memory', action='store_true',
+                        default=False, help='Memory evaluation')
+    parser.add_argument('--model', type=str,
+                        default=MODEL_PATH, help='onnx model path')
+    parser.add_argument('--output_path', type=str,
+                        default=OUT_RKNN_PATH, help='output rknn model path')
+    parser.add_argument('--dtype', type=str, default='i8',
+                        help='dtype of model, i8/fp32 for RKNPU2, u8/fp32 for RKNPU1')
     args = parser.parse_args()
 
     # Download model if not exist (from https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/MobileNet/mobilenetv2-12.onnx)
     check_and_download_origin_model()
 
     # Create RKNN object
-    rknn = RKNN(verbose=True)
+    rknn = RKNN(verbose=False)
 
     # Pre-process config
     print('--> Config model')
-    rknn.config(mean_values=[255*0.485, 255*0.456, 255*0.406], std_values=[255*0.229, 255*0.224, 255*0.225], target_platform=args.target)
+    rknn.config(mean_values=[[255*0.485, 255*0.456, 255*0.406]], std_values=[[
+                255*0.229, 255*0.224, 255*0.225]], target_platform=args.target)
     print('done')
 
     # Load model
     print('--> Loading model')
-    ret = rknn.load_onnx(model=args.model,
-                         inputs=['input'],
-                         input_size_list=[[1, 3, 224, 224]])
+    # RKNPU1 expects the input size without the batch dimension
+    if args.target in RKNPU1_TARGET:
+        ret = rknn.load_onnx(model=args.model, inputs=['input'], input_size_list=[[3, 224, 224]])
+    else:
+        ret = rknn.load_onnx(model=args.model, inputs=['input'], input_size_list=[[1, 3, 224, 224]])
+
     if ret != 0:
         print('Load model failed!')
         exit(ret)
@@ -93,7 +111,7 @@ def check_and_download_origin_model():
 
     # Build model
     print('--> Building model')
-    do_quant = True if args.dtype == 'i8' else False
+    do_quant = True if (args.dtype == 'i8' or args.dtype == 'u8') else False
     ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH)
     if ret != 0:
         print('Build model failed!')
@@ -111,12 +129,21 @@ def check_and_download_origin_model():
     # Set inputs
     img = cv2.imread('../model/bell.jpg')
     img = cv2.resize(img, (224, 224))
+    img = np.expand_dims(img, 0)
 
     # Init runtime environment
     print('--> Init runtime environment')
-    if args.npu_device_test or args.eval_perf or args.eval_memory:
-        ret = rknn.init_runtime(target=args.target, perf_debug=True, eval_mem=True)
+    if args.npu_device_test or args.target in RKNPU1_TARGET:
+        # For RKNPU1, the simulator has been disabled since version 1.7.5,
+        # so the runtime must be initialized on a real target
+        ret = rknn.init_runtime(target=args.target)
+    elif args.eval_perf or args.eval_memory:
+        ret = rknn.init_runtime(
+            target=args.target, perf_debug=True, eval_mem=True)
     else:
         ret = rknn.init_runtime()
     if ret != 0:
         print('Init runtime environment failed!')
@@ -157,7 +184,8 @@ def check_and_download_origin_model():
     if args.accuracy_analysis:
         print('--> Accuracy analysis')
         if args.npu_device_test:
-            ret = rknn.accuracy_analysis(inputs=['../model/bell.jpg'], target=args.target)
+            ret = rknn.accuracy_analysis(
+                inputs=['../model/bell.jpg'], target=args.target)
         else:
             ret = 
rknn.accuracy_analysis(inputs=['../model/bell.jpg'])
     if ret != 0:
diff --git a/examples/ppseg/README.md b/examples/ppseg/README.md
index 65a1196..6e55d8c 100644
--- a/examples/ppseg/README.md
+++ b/examples/ppseg/README.md
@@ -1,11 +1,16 @@
 # PaddleSeg Model Demo
 
+
+## Current Support Platform
+RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126
+
+
 ## Model Source
 
 Repository: [PaddleSeg](https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.8)
 
 Download link: 
-[pp_liteseg_cityscapes.onnx](https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx)
+[pp_liteseg_cityscapes.onnx](https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx )
 
 Download with shell command:
 
@@ -37,12 +42,13 @@ python convert.py
 
 - <onnx_model> should be the ONNX model path.
-- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588 according to board SOC version.
-- <dtype> is *optional*, could be specified as `i8` or `fp`, `i8` means to do quantization, `fp` means not to do quantization, default is `i8`.
+- <TARGET_PLATFORM> could be specified as RK3562, RK3566, RK3568, RK3588, RK1808, RV1109, RV1126 according to board SOC version.
+- <dtype> is *optional*, could be specified as `i8`, `u8` or `fp`, `i8`/`u8` means to do quantization, `fp` means not to do quantization, default is `i8` for RKNPU2 platforms and `u8` for RKNPU1 platforms.
 - <output_rknn_path> is *optional*, used to specify the saving path of the RKNN model, default save path is `../model/pp_liteseg.rknn`
 
 ## Android Demo
 
+**Note: RK1808, RV1109, RV1126 do not support Android.**
 
 ### Compiling && Building
 
@@ -85,7 +91,7 @@ export LD_LIBRARY_PATH=./lib
 
 ### Pull result img
 ```
-adb pull /data/ppseg/result.png .
+adb pull /data/rknn_ppseg_demo/result.png .
 ```
 
@@ -148,3 +154,8 @@ export LD_LIBRARY_PATH=./lib:
 adb pull /data/rknn_ppseg_demo/result.png ./
 ```
 
+## Expected Results
+
+
+
+- Note: Different platforms, different versions of tools and drivers may have slightly different results.
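
The `i8`/`u8` dtypes in these READMEs refer to asymmetric affine quantization: a float value f maps to a quantized value q = clamp(round(f / scale) + zp), and back via f ≈ (q - zp) * scale, where zp is the zero point. The RKNPU2 toolchain emits signed int8 tensors and the RKNPU1 toolchain unsigned uint8, which is why the C demos below carry both i8 and u8 dequantization helpers. A small Python illustration of the round trip, with arbitrarily chosen (hypothetical) tensor quantization parameters:

```python
def quantize_u8(f, scale, zp):
    # f32 -> u8: divide by scale, shift by the zero point, clamp to the u8 range
    q = round(f / scale) + zp
    return max(0, min(255, q))

def dequantize_u8(q, scale, zp):
    # u8 -> f32: undo the zero-point shift, then rescale
    return (q - zp) * scale

scale, zp = 0.0039, 128                 # hypothetical quant params, not from a real model
q = quantize_u8(0.25, scale, zp)        # -> 192
print(q, dequantize_u8(q, scale, zp))   # ~0.2496: a small rounding error is expected
```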
diff --git a/examples/ppseg/cpp/CMakeLists.txt b/examples/ppseg/cpp/CMakeLists.txt
index acd9c38..2744a84 100644
--- a/examples/ppseg/cpp/CMakeLists.txt
+++ b/examples/ppseg/cpp/CMakeLists.txt
@@ -14,11 +14,17 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out)
 
 set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib")
 
+if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126")
+    set(ppseg_file rknpu1/ppseg.cc)
+else()
+    set(ppseg_file rknpu2/ppseg.cc)
+endif()
+
 file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 
 add_executable(${PROJECT_NAME}
     main.cc
-    rknpu2/ppseg.cc
+    ${ppseg_file}
 )
 
 target_link_libraries(${PROJECT_NAME}
@@ -26,6 +32,7 @@ target_link_libraries(${PROJECT_NAME}
     imageutils
     imagedrawing
     ${LIBRKNNRT}
+    dl
 )
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Android")
diff --git a/examples/ppseg/cpp/rknpu1/ppseg.cc b/examples/ppseg/cpp/rknpu1/ppseg.cc
new file mode 100644
index 0000000..8bca61a
--- /dev/null
+++ b/examples/ppseg/cpp/rknpu1/ppseg.cc
@@ -0,0 +1,270 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+#include <chrono>
+#include <iostream>
+#include "ppseg.h"
+#include "common.h"
+#include "file_utils.h"
+#include "image_utils.h"
+
+// Define the type of color
+using Color = std::tuple<int, int, int>;
+
+// Define a structure to represent a row of the table
+struct Entry {
+    int id;
+    const char* name;
+    Color color;
+};
+
+// Define a global table
+Entry cityscapes_label[] = {
+    {0, "road", Color(128, 64, 128)},
+    {1, "sidewalk", Color(244, 35, 232)},
+    {2, "building", Color(70, 70, 70)},
+    {3, "wall", Color(102, 102, 156)},
+    {4, "fence", Color(190, 153, 153)},
+    {5, "pole", Color(153, 153, 153)},
+    {6, "traffic light", Color(250, 170, 30)},
+    {7, "traffic sign", Color(220, 220, 0)},
+    {8, "vegetation", Color(107, 142, 35)},
+    {9, "terrain", Color(152, 251, 152)},
+    {10, "sky", Color(70, 130, 180)},
+    {11, "person", Color(220, 20, 60)},
+    {12, "rider", Color(255, 0, 0)},
+    {13, "car", Color(0, 0, 142)},
+    {14, "truck", Color(0, 0, 70)},
+    {15, "bus", Color(0, 60, 100)},
+    {16, "train", Color(0, 80, 100)},
+    {17, "motorcycle", Color(0, 0, 230)},
+    {18, "bicycle", Color(119, 11, 32)}
+};
+
+
+static void dump_tensor_attr(rknn_tensor_attr* attr)
+{
+    printf("  index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, "
+           "zp=%d, scale=%f\n",
+           attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0],
+           attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type),
+           get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale);
+}
+
+Color getColorById(int id) {
+    for (const auto& entry : cityscapes_label) {
+        if (entry.id == id) {
+            return entry.color;
+        }
+    }
+    return Color(0, 0, 0);
+}
+
+int draw_segment_image(float* result, image_buffer_t* result_img)
+{
+    int height = result_img->height;
+    int width = result_img->width;
+    int num_class = 19;
+    result_img->virt_addr = (unsigned char*)malloc(3 * height * width);
+    memset(result_img->virt_addr, 0, 3 * height * width);
+    // [1,class,height,width] -> [1,3,height,width]
+    for (int batch = 0; batch < 1; batch++) {
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                int maxClassIndex = 0;
+                for (int c = 1; c < num_class; c++) {
+                    int currentIndex = batch * (num_class * height * width) + c * (height * width) + y * width + x;
+                    int maxClassPos = batch * (num_class * height * width) + maxClassIndex * (height * width) + y * width + x;
+                    if (result[currentIndex] > result[maxClassPos]) {
+                        maxClassIndex = c;
+                    }
+                }
+                
Color foundColor = getColorById(maxClassIndex); + + int imageIndex = batch * (3 * height * width) + y * width * 3 + x * 3; + result_img->virt_addr[imageIndex] = std::get<0>(foundColor); // R + result_img->virt_addr[imageIndex + 1] = std::get<1>(foundColor); // G + result_img->virt_addr[imageIndex + 2] = std::get<2>(foundColor); // B + } + } + } + return 0; +} + +int init_ppseg_model(const char* model_path, rknn_app_context_t* app_ctx) +{ + int ret; + int model_len = 0; + char* model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr*)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr*)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_ppseg_model(rknn_app_context_t* app_ctx) +{ + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_ppseg_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, image_buffer_t* result_img) +{ + int ret; + image_buffer_t img; + rknn_input inputs[1]; + rknn_output outputs[1]; + + memset(&img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + img.width = app_ctx->model_width; + img.height = app_ctx->model_height; + img.format = IMAGE_FORMAT_RGB888; + img.size = get_image_size(&img); + img.virt_addr = (unsigned char*)malloc(img.size); + if (img.virt_addr == NULL) { + printf("malloc buffer size:%d fail!\n", img.size); + return -1; + } + + ret = convert_image(src_img, &img, NULL, NULL, 0); + if (ret < 0) { + printf("convert_image fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = img.virt_addr; + + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + auto start = std::chrono::high_resolution_clock::now(); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start); + std::cout << "rknn run cost: " << float(duration.count()/1000.0) << " ms" << std::endl; + + // Get Output + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + // Post Process + // outputs -> take top1 pixel by pixel -> assign color + ret = draw_segment_image((float* )outputs[0].buf, result_img); + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + +out: + if (img.virt_addr != NULL) { + free(img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/ppseg/cpp/rknpu2/ppseg.cc b/examples/ppseg/cpp/rknpu2/ppseg.cc index 28808a4..16413d4 100644 --- a/examples/ppseg/cpp/rknpu2/ppseg.cc +++ b/examples/ppseg/cpp/rknpu2/ppseg.cc @@ -9,17 +9,17 @@ #include "file_utils.h" #include "image_utils.h" -// 定义颜色的类型 +// Define the type of color using Color = std::tuple; -// 定义一个结构体,表示表的一行 +//Define a structure to represent a row of the table struct Entry { int id; const char* name; Color color; }; -// 定义一个全局的表 +//Define a global table Entry cityscapes_label[] = { {0, "road", Color(128, 64, 128)}, {1, "sidewalk", Color(244, 35, 232)}, @@ -177,10 +177,6 @@ int init_ppseg_model(const char* model_path, rknn_app_context_t* app_ctx) int release_ppseg_model(rknn_app_context_t* app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -189,6 +185,10 @@ int release_ppseg_model(rknn_app_context_t* app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } @@ -255,7 +255,7 @@ int inference_ppseg_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, } // Post Process - // outputs -> 逐像素取top1 -> 分配颜色 + // outputs -> take top1 pixel by pixel -> assign color ret = draw_segment_image((float* )outputs[0].buf, result_img); // Remeber to release rknn output rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); diff --git a/examples/ppseg/model/download_model.sh b/examples/ppseg/model/download_model.sh index c8e059b..262ee12 100644 --- a/examples/ppseg/model/download_model.sh +++ b/examples/ppseg/model/download_model.sh @@ -1 +1 @@ -wget -O pp_liteseg_cityscapes.onnx https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx +wget -O pp_liteseg_cityscapes.onnx https://ftzr.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx diff --git a/examples/ppseg/python/convert.py b/examples/ppseg/python/convert.py index 0a09b09..f763711 100644 --- a/examples/ppseg/python/convert.py +++ b/examples/ppseg/python/convert.py @@ -1,44 +1,48 @@ -from rknn.api import RKNN -import numpy as np import sys +from rknn.api import RKNN DATASET_PATH = '../model/dataset.txt' -MEAN = [[0.485*255, 0.456*255, 0.406*255]] -STD = [[0.229*255, 0.224*255, 0.225*255]] +DEFAULT_RKNN_PATH = '../model/pp_liteseg.rknn' +DEFAULT_QUANT = True -if __name__ == '__main__': +def parse_arg(): if len(sys.argv) < 3: - print("Usage: python3 {} pt_model_path [rk3566|rk3588|rk3562] [i8/fp (optional)] [output_path (optional)]".format(sys.argv[0])); + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); + print(" platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]") + print(" dtype choose from [i8, fp] for [rk3562, rk3566, rk3568, rk3588]") + print(" dtype choose from [u8, fp] for [rk1808, rv1109, rv1126]") exit(1) model_path = sys.argv[1] platform = 
sys.argv[2]
+    do_quant = DEFAULT_QUANT
     if len(sys.argv) > 3:
         model_type = sys.argv[3]
-        if model_type not in ['i8', 'fp']:
+        if model_type not in ['i8', 'u8', 'fp']:
             print("ERROR: Invalid model type: {}".format(model_type))
             exit(1)
-        elif model_type == 'i8':
+        elif model_type in ['i8', 'u8']:
             do_quant = True
-            print("ERROR: i8 quantization is not supported yet, ppseg-i8 drop accuracy!")
-            exit(1)
         else:
             do_quant = False
-    else:
-        do_quant = True
 
     if len(sys.argv) > 4:
         output_path = sys.argv[4]
     else:
-        output_path = '../model/pp_liteseg.rknn'
+        output_path = DEFAULT_RKNN_PATH
+
+    return model_path, platform, do_quant, output_path
+
+if __name__ == '__main__':
+    model_path, platform, do_quant, output_path = parse_arg()
 
     # Create RKNN object
     rknn = RKNN(verbose=False)
 
     # Pre-process config
     print('--> Config model')
-    rknn.config(mean_values=MEAN, std_values=STD, target_platform=platform)
+    rknn.config(mean_values=[[0.485*255, 0.456*255, 0.406*255]], std_values=[[0.229*255, 0.224*255, 0.225*255]], target_platform=platform)
     print('done')
 
     # Load model
diff --git a/examples/ppseg/result.png b/examples/ppseg/result.png
new file mode 100644
index 0000000..b803e19
Binary files /dev/null and b/examples/ppseg/result.png differ
diff --git a/examples/ppyoloe/README.md b/examples/ppyoloe/README.md
index a82f786..534e713 100644
--- a/examples/ppyoloe/README.md
+++ b/examples/ppyoloe/README.md
@@ -31,7 +31,7 @@ https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.6/configs/ppyoloe
 
 ## 2. Current Support Platform
 
-RK3566, RK3568, RK3588, RK3562
+RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126
 
 
@@ -67,7 +67,7 @@ python convert.py ../model/ppyoloe_s.onnx rk3588
 
 - `<onnx_model>`: Specify ONNX model path.
 - `<TARGET_PLATFORM>`: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<dtype>(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `i8`.
+- `<dtype>(optional)`: Specify as `i8`, `u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `i8`.
 - `<output_rknn_path>(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model with name `ppyoloe.rknn`
 
@@ -95,29 +95,12 @@ python ppyoloe.py --model_path <rknn_model> --target <TARGET_PLATFORM> --img_sho
 
 ## 6. Android Demo
 
-#### 6.1 Compile and Build
-
-*Usage:*
-
-```sh
-# go back to the rknn_model_zoo root directory
-cd ../../
-export ANDROID_NDK_PATH=<android_ndk_path>
-
-./build-android.sh -t <TARGET_PLATFORM> -a <ARCH> -d ppyoloe
+**Note: RK1808, RV1109, RV1126 do not support Android.**
 
-# such as 
-./build-android.sh -t rk3588 -a arm64-v8a -d ppyoloe
-```
+#### 6.1 Compile and Build
 
-*Description:*
-- `<android_ndk_path>`: Specify Android NDK path.
-- `<TARGET_PLATFORM>`: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<ARCH>`: Specify device system architecture. To query device architecture, refer to the following command:
-    ```shell
-    # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should be shown in log.
-    adb shell cat /proc/version
-    ```
+Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to set up a cross-compilation environment and complete the compilation of C/C++ Demo.
+**Note: Please replace the model name with `ppyoloe`.**
 
#### 6.2 Push demo files to device
 
@@ -151,31 +134,8 @@ export LD_LIBRARY_PATH=./lib
 
#### 7.1 Compile and Build
 
-*Usage:*
-
-```shell
-# go back to the rknn_model_zoo root directory
-cd ../../
-
-# if GCC_COMPILER not found while building, please set GCC_COMPILER path
-(optional)export GCC_COMPILER=<GCC_COMPILER_PATH>
-
-./build-linux.sh -t <TARGET_PLATFORM> -a <ARCH> -d ppyoloe
-
-# such as 
-./build-linux.sh -t rk3588 -a aarch64 -d ppyoloe
-```
-
-*Description:*
-
-- `<GCC_COMPILER_PATH>`: Specified as GCC_COMPILER path.
-- `<TARGET_PLATFORM>` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<ARCH>`: Specify device system architecture. To query device architecture, refer to the following command:
-
-    ```shell
-    # Query architecture. For Linux, ['aarch64' or 'armhf'] should be shown in log.
-    adb shell cat /proc/version
-    ```
+Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to set up a cross-compilation environment and complete the compilation of C/C++ Demo.
+**Note: Please replace the model name with `ppyoloe`.**
 
#### 7.2 Push demo files to device
 
@@ -222,4 +182,4 @@ person @ (103 243 116 285) 0.253
 
-- Note: Different platforms, different versions of tools and drivers may have slightly different results.
\ No newline at end of file
+- Note: Different platforms, different versions of tools and drivers may have slightly different results.
diff --git a/examples/ppyoloe/cpp/CMakeLists.txt b/examples/ppyoloe/cpp/CMakeLists.txt
index 8e2ce7a..991b884 100644
--- a/examples/ppyoloe/cpp/CMakeLists.txt
+++ b/examples/ppyoloe/cpp/CMakeLists.txt
@@ -16,10 +16,17 @@ set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib")
 
 file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
 
+set(rknpu_ppyoloe_file rknpu2/ppyoloe.cc)
+
+if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126")
+    add_definitions(-DRKNPU1)
+    set(rknpu_ppyoloe_file rknpu1/ppyoloe.cc)
+endif()
+
 add_executable(${PROJECT_NAME}
     main.cc
     postprocess.cc
-    rknpu2/ppyoloe.cc
+    ${rknpu_ppyoloe_file}
 )
 
 target_link_libraries(${PROJECT_NAME}
@@ -27,6 +34,7 @@ target_link_libraries(${PROJECT_NAME}
     imageutils
     imagedrawing
     ${LIBRKNNRT}
+    dl
 )
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Android")
diff --git a/examples/ppyoloe/cpp/postprocess.cc b/examples/ppyoloe/cpp/postprocess.cc
index 9c99c9f..c953795 100644
--- a/examples/ppyoloe/cpp/postprocess.cc
+++ b/examples/ppyoloe/cpp/postprocess.cc
@@ -194,10 +194,18 @@ static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale)
     return res;
 }
 
+static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale)
+{
+    float dst_val = (f32 / scale) + zp;
+    uint8_t res = (uint8_t)__clip(dst_val, 0, 255);
+    return res;
+}
+
 static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; }
 
+static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; }
 
-void compute_dfl(float* tensor, int dfl_len, float* box){
+static void compute_dfl(float* tensor, int dfl_len, float* box){
     for (int b=0; b<4; b++){
         float exp_t[dfl_len];
         float exp_sum=0;
@@ -214,6 +222,80 @@ void compute_dfl(float* tensor, int dfl_len, float* box){
     }
 }
 
+static int process_u8(uint8_t *box_tensor, int32_t box_zp, float box_scale,
+                      uint8_t *score_tensor, int32_t score_zp, float score_scale,
+                      uint8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale,
+                      int grid_h, int grid_w, int stride, int dfl_len,
+                      std::vector<float> &boxes,
+                      std::vector<float> &objProbs,
+                      std::vector<int> &classId,
+                      float threshold)
+{
+    int validCount = 0;
+    int grid_len = grid_h * grid_w;
+    uint8_t score_thres_u8 = qnt_f32_to_affine_u8(threshold, score_zp, score_scale);
+    uint8_t score_sum_thres_u8 = qnt_f32_to_affine_u8(threshold, score_sum_zp, score_sum_scale);
+
+    for (int i = 0; i < grid_h; i++)
+    {
+        for (int j = 0; j < grid_w; j++)
+        {
+            int offset = i * grid_w + j;
+            int max_class_id = -1;
+
+            // Use score sum to quickly filter
+            if (score_sum_tensor != nullptr)
+            {
+                if (score_sum_tensor[offset] < score_sum_thres_u8)
+                {
+                    continue;
+                }
+            }
+
+            uint8_t max_score = 0; // uint8 minimum; any real score must beat this
+            for (int c = 0; c < OBJ_CLASS_NUM; c++)
+            {
+                if ((score_tensor[offset] > score_thres_u8) && (score_tensor[offset] > max_score))
+                {
+                    max_score = score_tensor[offset];
+                    max_class_id = c;
+                }
+                offset += grid_len;
+            }
+
+            // compute box
+            if (max_score > score_thres_u8)
+            {
+                offset = i * grid_w + j;
+                float box[4];
+                float before_dfl[dfl_len * 4];
+                for (int k = 0; k < dfl_len * 4; k++)
+                {
+                    before_dfl[k] = deqnt_affine_u8_to_f32(box_tensor[offset], box_zp, box_scale);
+                    offset += grid_len;
+                }
+                compute_dfl(before_dfl, dfl_len, box);
+
+                float x1, y1, x2, y2, w, h;
+                x1 = (-box[0] + j + 0.5) * stride;
+                y1 = (-box[1] + i + 0.5) * stride;
+                x2 = (box[2] + j + 0.5) * stride;
+                y2 = (box[3] + i + 0.5) * stride;
+                w = x2 - x1;
+                h = y2 - y1;
+                boxes.push_back(x1);
+                boxes.push_back(y1);
+                boxes.push_back(w);
+                boxes.push_back(h);
+
+                objProbs.push_back(deqnt_affine_u8_to_f32(max_score, score_zp, score_scale));
+                classId.push_back(max_class_id);
+                validCount++;
+            }
+        }
+    }
+    return validCount;
+}
 
 static int process_i8(int8_t *box_tensor, int32_t box_zp, float box_scale,
                       int8_t *score_tensor, int32_t score_zp, float score_scale,
@@ -366,7 +448,11 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t
     memset(od_results, 0, sizeof(object_detect_result_list));
 
     // default 3 branch
+#ifdef RKNPU1
+    int dfl_len = app_ctx->output_attrs[0].dims[2] / 4;
+#else
     int dfl_len = app_ctx->output_attrs[0].dims[1] / 4;
+#endif
     int output_per_branch = app_ctx->io_num.n_output / 3;
     for (int i = 0; i < 3; i++)
     {
@@ -382,17 +468,30 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t
         int box_idx = i * output_per_branch;
         int score_idx = i * output_per_branch + 1;
 
+#ifdef RKNPU1
+        grid_h = app_ctx->output_attrs[box_idx].dims[1];
+        grid_w = app_ctx->output_attrs[box_idx].dims[0];
+#else
         grid_h = app_ctx->output_attrs[box_idx].dims[2];
         grid_w = app_ctx->output_attrs[box_idx].dims[3];
+#endif
         stride = model_in_h / grid_h;
 
         if (app_ctx->is_quant)
         {
+#ifdef RKNPU1
+            validCount += process_u8((uint8_t *)outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale,
+                                     (uint8_t *)outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale,
+                                     (uint8_t *)score_sum, score_sum_zp, score_sum_scale,
+                                     grid_h, grid_w, stride, dfl_len,
+                                     filterBoxes, objProbs, classId, conf_threshold);
+#else
             validCount += process_i8((int8_t *)outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale,
                                      (int8_t *)outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale,
                                      (int8_t *)score_sum, score_sum_zp, score_sum_scale,
                                      grid_h, grid_w, stride, dfl_len,
                                      filterBoxes, objProbs, classId, conf_threshold);
+#endif
         }
         else
         {
diff --git a/examples/ppyoloe/cpp/rknpu1/ppyoloe.cc b/examples/ppyoloe/cpp/rknpu1/ppyoloe.cc
new file mode 100644 
index 0000000..e4ef43f --- /dev/null +++ b/examples/ppyoloe/cpp/rknpu1/ppyoloe.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "ppyoloe.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_ppyoloe_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type == RKNN_TENSOR_UINT8) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_ppyoloe_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_ppyoloe_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/ppyoloe/cpp/rknpu2/ppyoloe.cc b/examples/ppyoloe/cpp/rknpu2/ppyoloe.cc index fdce081..bc85898 100644 --- a/examples/ppyoloe/cpp/rknpu2/ppyoloe.cc +++ b/examples/ppyoloe/cpp/rknpu2/ppyoloe.cc @@ -137,11 +137,6 @@ int init_ppyoloe_model(const char *model_path, rknn_app_context_t *app_ctx) int release_ppyoloe_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -152,6 +147,11 @@ int release_ppyoloe_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/ppyoloe/python/convert.py b/examples/ppyoloe/python/convert.py index 854bf4f..445c9d4 100644 --- a/examples/ppyoloe/python/convert.py +++ b/examples/ppyoloe/python/convert.py @@ -1,6 +1,4 @@ -import os import sys -import numpy as np from rknn.api import RKNN DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt' @@ -9,9 +7,10 @@ def parse_arg(): if len(sys.argv) < 3: - print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -20,10 +19,10 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False diff --git a/examples/resnet/README.md b/examples/resnet/README.md index 7e429a0..459550f 100644 --- a/examples/resnet/README.md +++ b/examples/resnet/README.md @@ -32,8 +32,8 @@ python resnet.py ../model/resnet50-v2-7.onnx rk3588 *Description:* - should be the ONNX model path. -- could be specified as RK3562, RK3566, RK3568, RK3588 according to board SOC version. -- is *optional*, could be specified as `i8` or `fp`, `i8` means to do quantization, `fp` means no to do quantization, default is `i8`. +- could be specified as RK3562, RK3566, RK3568, RK3588, RK1808, RV1109, RV1126 according to board SOC version. 
+- `<dtype>` is *optional*, could be specified as `i8`, `u8` or `fp`, `i8`/`u8` means to do quantization, `fp` means not to do quantization, default is `i8`. - `<output_rknn_path>` is *optional*, used to specify the saving path of the RKNN model, default save path is `../model/resnet50-v2-7.rknn` @@ -41,6 +41,8 @@ python resnet.py ../model/resnet50-v2-7.onnx rk3588 ## Android Demo +**Note: RK1808, RV1109, RV1126 do not support Android.** + ### Compiling && Building ```sh diff --git a/examples/resnet/cpp/CMakeLists.txt b/examples/resnet/cpp/CMakeLists.txt index 1678f01..98d3fd5 100644 --- a/examples/resnet/cpp/CMakeLists.txt +++ b/examples/resnet/cpp/CMakeLists.txt @@ -16,9 +16,14 @@ set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) +set(resnet_file rknpu2/resnet.cc) +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(resnet_file rknpu1/resnet.cc) +endif() + add_executable(${PROJECT_NAME} main.cc - rknpu2/resnet.cc + ${resnet_file} ) target_link_libraries(${PROJECT_NAME} @@ -26,6 +31,7 @@ target_link_libraries(${PROJECT_NAME} imageutils imagedrawing ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/examples/resnet/cpp/rknpu1/resnet.cc b/examples/resnet/cpp/rknpu1/resnet.cc new file mode 100644 index 0000000..c6824d2 --- /dev/null +++ b/examples/resnet/cpp/rknpu1/resnet.cc @@ -0,0 +1,275 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> + +#include "resnet.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr* attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +typedef struct { + float value; + int index; +} element_t; + +static void swap(element_t* a, element_t* b) { + element_t temp = *a; + *a = *b; + *b = temp; +} + +static int partition(element_t arr[], int low, int high) { + float pivot = arr[high].value; + int i = low - 1; + + for (int j = low; j <= high - 1; j++) { + if (arr[j].value >= pivot) { + i++; + swap(&arr[i], &arr[j]); + } + } + + swap(&arr[i + 1], &arr[high]); + return (i + 1); +} + +static void quick_sort(element_t arr[], int low, int high) { + if (low < high) { + int pi = partition(arr, low, high); + quick_sort(arr, low, pi - 1); + quick_sort(arr, pi + 1, high); + } +} + +static void softmax(float* array, int size) { + // Find the maximum value in the array + float max_val = array[0]; + for (int i = 1; i < size; i++) { + if (array[i] > max_val) { + max_val = array[i]; + } + } + + // Subtract the maximum value from each element to avoid overflow + for (int i = 0; i < size; i++) { + array[i] -= max_val; + } + + // Compute the exponentials and sum + float sum = 0.0; + for (int i = 0; i < size; i++) { + array[i] = expf(array[i]); + sum += array[i]; + } + + // Normalize the array by dividing each element by the sum + for (int i = 0; i < size; i++) { + array[i] /= sum; + } +} + +static void get_topk_with_indices(float arr[], int size, int k, resnet_result* result) { + + // Create an array of elements, saving values and index numbers + element_t* elements = (element_t*)malloc(size * sizeof(element_t)); + for (int i = 0; i < size; i++) { + elements[i].value = arr[i]; + 
elements[i].index = i; + } + + // Quick sort an array of elements + quick_sort(elements, 0, size - 1); + + // Get the top K maximum values ​​and their index numbers + for (int i = 0; i < k; i++) { + result[i].score = elements[i].value; + result[i].cls = elements[i].index; + } + + free(elements); +} + +int init_resnet_model(const char* model_path, rknn_app_context_t* app_ctx) +{ + int ret; + int model_len = 0; + char* model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr*)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr*)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_resnet_model(rknn_app_context_t* app_ctx) +{ + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + return 0; +} + +int inference_resnet_model(rknn_app_context_t* app_ctx, image_buffer_t* src_img, resnet_result* out_result, int topk) +{ + int ret; + image_buffer_t img; + rknn_input inputs[1]; + rknn_output outputs[1]; + + //defualt initialized + memset(&img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + img.width = app_ctx->model_width; + img.height = app_ctx->model_height; + img.format = IMAGE_FORMAT_RGB888; + img.size = get_image_size(&img); + img.virt_addr = (unsigned char*)malloc(img.size); + if (img.virt_addr == NULL) { + printf("malloc buffer size:%d fail!\n", img.size); + return -1; + } + + //caution: might have bug!! + ret = convert_image(src_img, &img, NULL, NULL, 0); + if (ret < 0) { + printf("convert_image fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); + if (ret < 0) { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, 1, outputs, NULL); + if (ret < 0) { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + // Post Process + softmax((float*)outputs[0].buf, app_ctx->output_attrs[0].n_elems); + + get_topk_with_indices((float*)outputs[0].buf, app_ctx->output_attrs[0].n_elems, topk, out_result); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, 1, outputs); + +out: + if (img.virt_addr != NULL) { + free(img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/resnet/cpp/rknpu2/resnet.cc b/examples/resnet/cpp/rknpu2/resnet.cc index 20fc052..3f4f64c 100644 --- a/examples/resnet/cpp/rknpu2/resnet.cc +++ b/examples/resnet/cpp/rknpu2/resnet.cc @@ -22,13 +22,13 @@ typedef struct { int index; } element_t; -void swap(element_t* a, element_t* b) { +static void swap(element_t* a, element_t* b) { element_t temp = *a; *a = *b; *b = temp; } -int partition(element_t arr[], int low, int high) { +static int partition(element_t arr[], int low, int high) { float pivot = arr[high].value; int i = low - 1; @@ -43,7 +43,7 @@ int partition(element_t arr[], int low, int high) { return (i + 1); } -void quick_sort(element_t arr[], int low, int high) { +static void quick_sort(element_t arr[], int low, int high) { if (low < high) { int pi = partition(arr, low, high); quick_sort(arr, low, pi - 1); @@ -51,7 +51,7 @@ void quick_sort(element_t arr[], int low, int high) { } } -void softmax(float* array, int size) { +static void softmax(float* array, int size) { // Find the maximum value in the array float max_val = array[0]; for (int i = 1; i < size; i++) { @@ -78,19 +78,19 @@ void softmax(float* array, int size) { } } -void get_topk_with_indices(float arr[], int size, int k, resnet_result* result) { +static void get_topk_with_indices(float arr[], int size, int k, resnet_result* result) { - // 创建元素数组,保存值和索引号 + // Create an array of elements, saving values ​​and index numbers element_t* elements = (element_t*)malloc(size * sizeof(element_t)); for (int i = 0; i < size; i++) { elements[i].value = arr[i]; elements[i].index = i; } - // 对元素数组进行快速排序 + // Quick sort an array of elements quick_sort(elements, 0, size - 1); - // 获取前K个最大值和它们的索引号 + // Get the top K maximum values ​​and their index numbers for (int i = 0; i < k; i++) { result[i].score = elements[i].value; result[i].cls = elements[i].index; @@ -184,10 +184,6 @@ int init_resnet_model(const char* model_path, rknn_app_context_t* app_ctx) int release_resnet_model(rknn_app_context_t* app_ctx) { - if (app_ctx->rknn_ctx != 0) { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); app_ctx->input_attrs = NULL; @@ -196,6 +192,10 @@ int release_resnet_model(rknn_app_context_t* app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/resnet/python/resnet.py b/examples/resnet/python/resnet.py index be40b9c..2d7ae33 100644 --- a/examples/resnet/python/resnet.py +++ b/examples/resnet/python/resnet.py @@ -14,6 +14,8 @@ CLASS_LABEL_PATH = '../model/synset.txt' DEFAULT_QUANT = True +RKNPU1_TARGET = ['rk1808', 'rv1109', 'rv1126'] + def readable_speed(speed): speed_bytes = float(speed) speed_kbytes = speed_bytes / 1024 @@ -58,9 +60,10 @@ def check_and_download_origin_model(): def parse_arg(): if len(sys.argv) < 3: - print("Usage: python3 {} [onnx_model_path] [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from 
[rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print("Usage: python3 {} [onnx_model_path] [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -69,10 +72,10 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['u8', 'i8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False @@ -96,14 +99,16 @@ def parse_arg(): # Pre-process config print('--> Config model') - rknn.config(mean_values=[255*0.485, 255*0.456, 255*0.406], std_values=[255*0.229, 255*0.224, 255*0.225], target_platform=platform) + rknn.config(mean_values=[[255*0.485, 255*0.456, 255*0.406]], std_values=[[255*0.229, 255*0.224, 255*0.225]], target_platform=platform) print('done') # Load model print('--> Loading model') - ret = rknn.load_onnx(model=model_path, - inputs=['data'], - input_size_list=[[1, 3, 224, 224]]) + if platform.lower() in RKNPU1_TARGET: + ret = rknn.load_onnx(model=model_path, inputs=['data'], input_size_list=[[3, 224, 224]]) + else: + ret = rknn.load_onnx(model=model_path, inputs=['data'], input_size_list=[[1, 3, 224, 224]]) + if ret != 0: print('Load model failed!') exit(ret) @@ -129,10 +134,15 @@ def parse_arg(): img = cv2.imread('../model/dog_224x224.jpg') img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = cv2.resize(img, (224, 224)) + img = np.expand_dims(img, 0) # Init runtime environment print('--> Init runtime environment') - ret = rknn.init_runtime() + if platform.lower() in RKNPU1_TARGET: + # For RKNPU1, the simulator has been disabled since version 1.7.5 + ret = rknn.init_runtime(target=platform) + else: + ret = rknn.init_runtime() if ret != 0: print('Init runtime environment failed!') exit(ret) diff --git a/examples/yolov5/README.md b/examples/yolov5/README.md index 4c10bc7..642bcf4 100644 --- a/examples/yolov5/README.md +++ b/examples/yolov5/README.md @@ -19,6 +19,24 @@ cd model ./download_model.sh ``` +**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolov5n.onnx as an example to show the difference between them. +1. The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. + +<div align=center>
+  <img src="./model_comparison/yolov5_output_comparison.jpg" alt="Image" /> +</div>
+ +2. Taking the output change [1,19200,85]->[1,255,80,80] as an example, we delete a subgraph (the framed part in the picture) from the model and put it in post-processing (this subgraph is not quantization-friendly); a minimal sketch of this decode step is shown below. + +<div align=center>
+  <img src="./model_comparison/yolov5_graph_comparison.jpg" alt="Image" /> +</div>
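The decode work that this removed subgraph used to do now happens in the demo's post-processing (`process_i8`/`process_u8`/`process_fp32` in `cpp/postprocess.cc`, shown later in this diff). As a rough illustration only, here is a minimal float-path sketch of decoding one `[1, 255, 80, 80]` head; the `Box` struct and the `decode_head` helper are made up for this note, and it assumes the optimized model's layout of 3 anchors x (5 + 80 classes) per cell, with sigmoid already applied inside the model:

```cpp
#include <vector>

// Illustrative container; the real demo keeps boxes, class ids and scores
// in separate std::vector<float> / std::vector<int> arrays.
struct Box { float x, y, w, h, score; int cls; };

// Decode one output head laid out as [3 * (5 + num_class), grid_h, grid_w].
static void decode_head(const float *input, const int *anchor, int grid_h, int grid_w,
                        int stride, int num_class, float conf_thres, std::vector<Box> &out)
{
    const int prop_box_size = 5 + num_class; // x, y, w, h, conf, class scores
    const int grid_len = grid_h * grid_w;
    for (int a = 0; a < 3; a++) {
        for (int i = 0; i < grid_h; i++) {
            for (int j = 0; j < grid_w; j++) {
                // p points at this anchor's channel block for cell (i, j);
                // channel k of the block is p[k * grid_len].
                const float *p = input + (prop_box_size * a) * grid_len + i * grid_w + j;
                float conf = p[4 * grid_len];
                if (conf < conf_thres) continue;

                Box b;
                b.x = (p[0] * 2.0f - 0.5f + j) * stride;        // center x in pixels
                b.y = (p[grid_len] * 2.0f - 0.5f + i) * stride; // center y in pixels
                b.w = (p[2 * grid_len] * 2.0f) * (p[2 * grid_len] * 2.0f) * anchor[a * 2];
                b.h = (p[3 * grid_len] * 2.0f) * (p[3 * grid_len] * 2.0f) * anchor[a * 2 + 1];
                b.x -= b.w / 2.0f; // convert center to top-left corner
                b.y -= b.h / 2.0f;

                // pick the best of the num_class class scores
                b.cls = 0;
                float best = p[5 * grid_len];
                for (int k = 1; k < num_class; k++) {
                    float prob = p[(5 + k) * grid_len];
                    if (prob > best) { best = prob; b.cls = k; }
                }
                b.score = best * conf;
                out.push_back(b);
            }
        }
    }
}
```

The quantized paths do the same arithmetic but first convert the raw `int8`/`uint8` values back to float with `(q - zp) * scale`, and NMS then prunes the surviving boxes.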
+ +## Current Support Platform + +RK3562, RK3566, RK3568, RK3588, RV1103, RV1106, RK1808, RV1109, RV1126 + + ## Convert to RKNN @@ -38,7 +56,7 @@ python convert.py ../model/yolov5s_relu.onnx rk3566 - `<onnx_model>` should be the ONNX model path. - `<TARGET_PLATFORM>`: Specify NPU platform name. Such as 'rk3588'. -- `<dtype>` is *optional*, could be specified as `i8` or `fp`. `i8` means to do quantization, `fp` means no quantization, default is `i8`. +- `<dtype>` is *optional*, could be specified as `i8`/`u8` or `fp`. `i8`/`u8` means to do quantization, `fp` means no quantization, default is `i8`. - `<output_rknn_path>` is *optional*, used to specify the saving path of the RKNN model, default save in the same directory as ONNX model with name 'yolov5.rknn' @@ -64,27 +82,12 @@ python yolov5.py --model_path <rknn_model> --target <TARGET_PLATFORM> --img_show ## Android Demo -### Compiling && Building +**Note: RK1808, RV1109, RV1126 do not support Android.** -```sh -# go back to the rknn_model_zoo root directory -cd ../../ -export ANDROID_NDK_PATH=<android_ndk_path> - -./build-android.sh -t <TARGET_PLATFORM> -a <ARCH> -d yolov5 - -# such as -./build-android.sh -t rk3588 -a arm64-v8a -d yolov5 -``` - -- `<android_ndk_path>`: Specified as Android ndk path. -- `<TARGET_PLATFORM>`: Specify NPU platform name. Such as 'rk3588'. -- `<ARCH>`: Specify device system architecture. +### Compiling && Building -```shell -# Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] shown. -adb shell cat /proc/version -``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. 
+**Note: Please replace the model name with `yolov5`.** ### Push demo files to device diff --git a/examples/yolov5/cpp/CMakeLists.txt b/examples/yolov5/cpp/CMakeLists.txt index f55a387..fce90e6 100644 --- a/examples/yolov5/cpp/CMakeLists.txt +++ b/examples/yolov5/cpp/CMakeLists.txt @@ -9,12 +9,15 @@ if (ENABLE_ASAN) set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") endif () -set(rknpu2_yolov5_file rknpu2/yolov5.cc) +set(rknpu_yolov5_file rknpu2/yolov5.cc) if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") add_definitions(-DRV1106_1103) - set(rknpu2_yolov5_file rknpu2/yolov5_rv1106_1103.cc) + set(rknpu_yolov5_file rknpu2/yolov5_rv1106_1103.cc) #dma include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +elseif(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + add_definitions(-DRKNPU1) + set(rknpu_yolov5_file rknpu1/yolov5.cc) endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) @@ -28,7 +31,7 @@ file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc postprocess.cc - ${rknpu2_yolov5_file} + ${rknpu_yolov5_file} ) target_link_libraries(${PROJECT_NAME} @@ -36,6 +39,7 @@ target_link_libraries(${PROJECT_NAME} fileutils imagedrawing ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/examples/yolov5/cpp/main.cc b/examples/yolov5/cpp/main.cc index 26d2a43..90acf96 100644 --- a/examples/yolov5/cpp/main.cc +++ b/examples/yolov5/cpp/main.cc @@ -26,7 +26,7 @@ #include "image_drawing.h" #if defined(RV1106_1103) - #include "dma_alloc.cpp" + #include "dma_alloc.hpp" #endif /*------------------------------------------- @@ -68,6 +68,8 @@ int main(int argc, char **argv) dma_sync_cpu_to_device(rknn_app_ctx.img_dma_buf.dma_buf_fd); free(src_image.virt_addr); src_image.virt_addr = (unsigned char *)rknn_app_ctx.img_dma_buf.dma_buf_virt_addr; + src_image.fd = rknn_app_ctx.img_dma_buf.dma_buf_fd; + rknn_app_ctx.img_dma_buf.size = src_image.size; #endif if (ret != 0) diff --git a/examples/yolov5/cpp/postprocess.cc b/examples/yolov5/cpp/postprocess.cc index 8f8d5bb..ce11dc2 100644 --- a/examples/yolov5/cpp/postprocess.cc +++ b/examples/yolov5/cpp/postprocess.cc @@ -198,7 +198,72 @@ static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale) return res; } +static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } +static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } + +static int process_u8(uint8_t *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride, + std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold, + int32_t zp, float scale) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + uint8_t thres_u8 = qnt_f32_to_affine_u8(threshold, zp, scale); + for (int a = 0; a < 3; a++) + { + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + uint8_t box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j]; + if (box_confidence >= thres_u8) + { + int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; + uint8_t *in_ptr = input + offset; + float box_x = 
(deqnt_affine_u8_to_f32(*in_ptr, zp, scale)) * 2.0 - 0.5; + float box_y = (deqnt_affine_u8_to_f32(in_ptr[grid_len], zp, scale)) * 2.0 - 0.5; + float box_w = (deqnt_affine_u8_to_f32(in_ptr[2 * grid_len], zp, scale)) * 2.0; + float box_h = (deqnt_affine_u8_to_f32(in_ptr[3 * grid_len], zp, scale)) * 2.0; + box_x = (box_x + j) * (float)stride; + box_y = (box_y + i) * (float)stride; + box_w = box_w * box_w * (float)anchor[a * 2]; + box_h = box_h * box_h * (float)anchor[a * 2 + 1]; + box_x -= (box_w / 2.0); + box_y -= (box_h / 2.0); + + uint8_t maxClassProbs = in_ptr[5 * grid_len]; + int maxClassId = 0; + for (int k = 1; k < OBJ_CLASS_NUM; ++k) + { + uint8_t prob = in_ptr[(5 + k) * grid_len]; + if (prob > maxClassProbs) + { + maxClassId = k; + maxClassProbs = prob; + } + } + if (maxClassProbs > thres_u8) + { + objProbs.push_back((deqnt_affine_u8_to_f32(maxClassProbs, zp, scale)) * (deqnt_affine_u8_to_f32(box_confidence, zp, scale))); + classId.push_back(maxClassId); + validCount++; + boxes.push_back(box_x); + boxes.push_back(box_y); + boxes.push_back(box_w); + boxes.push_back(box_h); + } + } + } + } + } + return validCount; +} static int process_i8(int8_t *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride, std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold, @@ -399,21 +464,36 @@ int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter for (int i = 0; i < 3; i++) { - + #if defined(RV1106_1103) - grid_h = app_ctx->output_attrs[i].dims[2]; - grid_w = app_ctx->output_attrs[i].dims[1]; + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[2]; stride = model_in_h / grid_h; //RV1106 only support i8 if (app_ctx->is_quant) { validCount += process_i8_rv1106((int8_t *)(_outputs[i]->virt_addr), (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); } -#else +#elif defined(RKNPU1) + // NCHW reversed: WHCN + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[0]; + stride = model_in_h / grid_h; + if (app_ctx->is_quant) + { + validCount += process_u8((uint8_t *)_outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); + } + else + { + validCount += process_fp32((float *)_outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold); + } +#else grid_h = app_ctx->output_attrs[i].dims[2]; grid_w = app_ctx->output_attrs[i].dims[3]; stride = model_in_h / grid_h; - if (app_ctx->is_quant) + if (app_ctx->is_quant) { validCount += process_i8((int8_t *)_outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); diff --git a/examples/yolov5/cpp/rknpu1/yolov5.cc b/examples/yolov5/cpp/rknpu1/yolov5.cc new file mode 100644 index 0000000..b3ce8fc --- /dev/null +++ b/examples/yolov5/cpp/rknpu1/yolov5.cc @@ -0,0 +1,251 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> + +#include "yolov5.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov5_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov5_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov5_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + // inputs[0].buf = img->virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! 
ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolov5/cpp/rknpu2/yolov5.cc b/examples/yolov5/cpp/rknpu2/yolov5.cc index 02c1fcf..bc9b562 100644 --- a/examples/yolov5/cpp/rknpu2/yolov5.cc +++ b/examples/yolov5/cpp/rknpu2/yolov5.cc @@ -137,11 +137,6 @@ int init_yolov5_model(const char *model_path, rknn_app_context_t *app_ctx) int release_yolov5_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -152,6 +147,11 @@ int release_yolov5_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } @@ -162,8 +162,8 @@ int inference_yolov5_model(rknn_app_context_t *app_ctx, image_buffer_t *img, obj letterbox_t letter_box; rknn_input inputs[app_ctx->io_num.n_input]; rknn_output outputs[app_ctx->io_num.n_output]; - const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 - const float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold int bg_color = 114; if ((!app_ctx) || !(img) || (!od_results)) diff --git a/examples/yolov5/cpp/rknpu2/yolov5_rv1106_1103.cc b/examples/yolov5/cpp/rknpu2/yolov5_rv1106_1103.cc index 7e7f4d2..090b631 100644 --- a/examples/yolov5/cpp/rknpu2/yolov5_rv1106_1103.cc +++ b/examples/yolov5/cpp/rknpu2/yolov5_rv1106_1103.cc @@ -154,11 +154,6 @@ int init_yolov5_model(const char *model_path, rknn_app_context_t *app_ctx) int release_yolov5_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -172,15 +167,18 @@ int release_yolov5_model(rknn_app_context_t *app_ctx) for (int i = 0; i < app_ctx->io_num.n_input; i++) { if (app_ctx->input_mems[i] != NULL) { rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); - free(app_ctx->input_mems[i]); } } for (int i = 0; i < app_ctx->io_num.n_output; i++) { if (app_ctx->output_mems[i] != NULL) { rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); - free(app_ctx->output_mems[i]); } } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } @@ -189,8 +187,8 @@ int inference_yolov5_model(rknn_app_context_t *app_ctx, image_buffer_t *img, obj int ret; image_buffer_t dst_img; letterbox_t letter_box; - const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 - const 
float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold int bg_color = 114; if ((!app_ctx) || !(img) || (!od_results)) @@ -206,8 +204,8 @@ int inference_yolov5_model(rknn_app_context_t *app_ctx, image_buffer_t *img, obj dst_img.height = app_ctx->model_height; dst_img.format = IMAGE_FORMAT_RGB888; dst_img.size = get_image_size(&dst_img); - dst_img.virt_addr = (unsigned char *)app_ctx->input_mems[0]->virt_addr; - if (dst_img.virt_addr == NULL) + dst_img.fd = app_ctx->input_mems[0]->fd; + if (dst_img.virt_addr == NULL && dst_img.fd == 0) { printf("malloc buffer size:%d fail!\n", dst_img.size); return -1; diff --git a/examples/yolov5/model_comparison/yolov5_graph_comparison.jpg b/examples/yolov5/model_comparison/yolov5_graph_comparison.jpg new file mode 100644 index 0000000..7a2c128 Binary files /dev/null and b/examples/yolov5/model_comparison/yolov5_graph_comparison.jpg differ diff --git a/examples/yolov5/model_comparison/yolov5_output_comparison.jpg b/examples/yolov5/model_comparison/yolov5_output_comparison.jpg new file mode 100644 index 0000000..bdc3a68 Binary files /dev/null and b/examples/yolov5/model_comparison/yolov5_output_comparison.jpg differ diff --git a/examples/yolov5/python/convert.py b/examples/yolov5/python/convert.py index 71552bf..0adb66e 100644 --- a/examples/yolov5/python/convert.py +++ b/examples/yolov5/python/convert.py @@ -8,9 +8,10 @@ def parse_arg(): if len(sys.argv) < 3: - print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -19,10 +20,10 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False diff --git a/examples/yolov5_seg/README.md b/examples/yolov5_seg/README.md index b01e447..5942b94 100644 --- a/examples/yolov5_seg/README.md +++ b/examples/yolov5_seg/README.md @@ -1,5 +1,9 @@ # yolov5_seg +## Current Support Platform +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 + + ## Model Source The model used in this example comes from the following open source projects: https://github.com/airockchip/yolov5 @@ -15,6 +19,18 @@ cd model ./download_model.sh ``` +**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolov5n-seg.onnx as an example to show the difference between them. +1. The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. The three colored boxes in the figure represent the changes in the three outputs. + +
+ Image +
+ +2. Taking the output change [1,19200,117] -> ([1,255,80,80],[1,96,80,80]) as an example, we split the convolution of [351x64x1x1] into [255x64x1x1] and [96x64x1x1], then we remove the subsequent subgraphs from the model (the framed part in the figure) and put them into post-processing (these subgraphs are not quantization-friendly); a minimal sketch of the resulting mask-assembly step is shown below the figure. + +
+ Image +
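The removed segmentation subgraphs are likewise reproduced in post-processing: each box kept after NMS carries 32 mask coefficients taken from its `[1, 96, H, W]` head (3 anchors x 32), and multiplying them against the `[32, 160, 160]` prototype output yields one 160x160 mask per box (see `matmul_by_cpu_fp` and `crop_mask_fp` in `cpp/rknpu1/postprocess.cc` later in this diff). Below is a minimal sketch of that mask-assembly step; the helper name `assemble_masks` is illustrative, with the 32/160 sizes hard-coded to match `PROTO_CHANNEL`/`PROTO_HEIGHT`/`PROTO_WEIGHT`:

```cpp
#include <vector>

// coeffs: boxes_num x 32 mask coefficients (one row per box kept after NMS)
// proto:  32 x (160 * 160) prototype tensor, flattened channel-major
// masks:  boxes_num x (160 * 160) output; values > 0 count as "inside the mask"
static void assemble_masks(const std::vector<float> &coeffs, const float *proto,
                           std::vector<float> &masks, int boxes_num)
{
    const int proto_channel = 32;
    const int proto_hw = 160 * 160;
    masks.assign((size_t)boxes_num * proto_hw, 0.0f);
    for (int b = 0; b < boxes_num; b++) {
        for (int p = 0; p < proto_hw; p++) {
            float acc = 0.0f;
            for (int c = 0; c < proto_channel; c++) {
                acc += coeffs[b * proto_channel + c] * proto[c * proto_hw + p];
            }
            masks[b * proto_hw + p] = acc;
        }
    }
}
```

The demo then crops each mask to its box, resizes it back to the original image size, and merges everything into a single class-indexed mask.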
## Model Convert @@ -25,14 +41,14 @@ cd model cd python python convert.py <onnx_model> <TARGET_PLATFORM> <dtype(optional)> <output_rknn_path(optional)> # such as: python convert.py ../model/yolov5s-seg.onnx rk3566 -# output model will be saved as ../model/yolov5s-seg.rknn +# output model will be saved as ../model/yolov5_seg.rknn ``` *Description:* - `<onnx_model>` should be the ONNX model path. -- `<TARGET_PLATFORM>` could be specified as RK3562, RK3566, RK3568, RK3588 according to board SOC version. -- `<dtype>` is *optional*, could be specified as `i8` or `fp`, `i8` means to do quantization, `fp` means no to do quantization, default is `i8`. +- `<TARGET_PLATFORM>` could be specified as RK3562, RK3566, RK3568, RK3588, RK1808, RV1109, RV1126 according to board SOC version. +- `<dtype>` is *optional*, could be specified as `i8`, `u8` or `fp`, `i8`/`u8` means to do quantization, `fp` means not to do quantization, default is `i8`/`u8`. - `<output_rknn_path>` is *optional*, used to specify the saving path of the RKNN model. @@ -54,7 +70,7 @@ python yolov5_seg.py --model_path {rknn_model} --target {target_platform} --anno ``` *Description:* - {onnx_model / rknn_model} should be the model path. - {target_platform} could be filled like [RK3566, RK3568, RK3588, RK3562] + {target_platform} could be filled like [RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126] - {val_annotation} is the path of COCO val annotation. - {val_dataset} is the path of COCO val images. @@ -64,24 +80,12 @@ Note: **For more usage, please execute command `python yolov5_seg.py --help.`** ## Android Demo +**Note: RK1808, RV1109, RV1126 do not support Android.** ### Compiling && Building -Modify the path of Android NDK in 'build-android.sh'. - -For example, - -```sh -ANDROID_NDK_PATH=~/opt/toolchain/android-ndk-r19c -``` - -Then, run this script: - -```sh -./build-android.sh -t <TARGET_PLATFORM> -a arm64-v8a -d yolov5_seg -``` - -Please use the specific platform instead of above. +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. 
+**Note: Please replace the model name with `yolov5_seg`.** ### Push all build output file to the board diff --git a/examples/yolov5_seg/cpp/CMakeLists.txt b/examples/yolov5_seg/cpp/CMakeLists.txt index 6dc4de8..54dd48a 100644 --- a/examples/yolov5_seg/cpp/CMakeLists.txt +++ b/examples/yolov5_seg/cpp/CMakeLists.txt @@ -23,14 +23,12 @@ endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") set(OpenCV_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/opencv/opencv-android-sdk-build/sdk/native/jni/abi-${CMAKE_ANDROID_ARCH_ABI}) else() - if (TARGET_SOC STREQUAL "x86-64") - # set(OpenCV_DIR ${MZ_ROOT}/libs/common/opencv/opencv-linux-x86_64/share/OpenCV) - elseif(TARGET_LIB_ARCH STREQUAL "lib") + if(TARGET_LIB_ARCH STREQUAL "lib") set(OpenCV_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/opencv/opencv-linux-armhf/share/OpenCV) else() set(OpenCV_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/opencv/opencv-linux-aarch64/share/OpenCV) endif() - endif() +endif() find_package(OpenCV REQUIRED) message(STATUS OpenCV_DIR=${OpenCV_DIR}) message(STATUS OpenCV_LIBS=${OpenCV_LIBS}) @@ -41,23 +39,41 @@ file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) #dma include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) - #drm include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/drm) +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(postprocess_file rknpu1/postprocess.cc) + set(yolov5_seg_file rknpu1/yolov5_seg.cc) + #matmul +else() + set(postprocess_file rknpu2/postprocess.cc) + set(yolov5_seg_file rknpu2/yolov5_seg.cc) +endif() + add_executable(${PROJECT_NAME} main.cc - postprocess.cc - rknpu2/yolov5_seg.cc + ${postprocess_file} + ${yolov5_seg_file} ) -target_link_libraries(${PROJECT_NAME} - fileutils - imageutils - imagedrawing - ${OpenCV_LIBS} - ${LIBRKNNRT} -) +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + target_link_libraries(${PROJECT_NAME} + fileutils + imageutils + imagedrawing + ${OpenCV_LIBS} + ${LIBRKNNRT} + ) +else() + target_link_libraries(${PROJECT_NAME} + fileutils + imageutils + imagedrawing + ${OpenCV_LIBS} + ${LIBRKNNRT} + ) + endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_link_libraries(${PROJECT_NAME} diff --git a/examples/yolov5_seg/cpp/easy_timer.h b/examples/yolov5_seg/cpp/easy_timer.h new file mode 100644 index 0000000..755c226 --- /dev/null +++ b/examples/yolov5_seg/cpp/easy_timer.h @@ -0,0 +1,43 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> + +class TIMER{ + private: + struct timeval start_time, stop_time; + double __get_us(struct timeval t) { return (t.tv_sec * 1000000 + t.tv_usec); } + char indent[40]; + + public: + TIMER(){} + ~TIMER(){} + + void indent_set(char* s){ + strcpy(indent, s); + } + void indent_set(const char* s){ + strcpy(indent, s); + } + + void tik(){ + gettimeofday(&start_time, NULL); + } + + void tok(){ + gettimeofday(&stop_time, NULL); + } + + void print_time(char* str){ + printf("%s", indent); + printf("%s use: %f ms\n", str, get_time()); + } + void print_time(const char* str){ + printf("%s", indent); + printf("%s use: %f ms\n", str, get_time()); + } + + float get_time(){ + return (__get_us(stop_time) - __get_us(start_time))/1000; + } +}; \ No newline at end of file diff --git a/examples/yolov5_seg/cpp/postprocess.h b/examples/yolov5_seg/cpp/postprocess.h index cb9cb8d..b3ddec0 100644 --- a/examples/yolov5_seg/cpp/postprocess.h +++ b/examples/yolov5_seg/cpp/postprocess.h @@ -14,6 +14,10 @@ #define BOX_THRESH 
0.25 #define PROP_BOX_SIZE (5 + OBJ_CLASS_NUM) +#define PROTO_CHANNEL 32 +#define PROTO_HEIGHT 160 +#define PROTO_WEIGHT 160 + // class rknn_app_context_t; typedef struct diff --git a/examples/yolov5_seg/cpp/rknpu1/postprocess.cc b/examples/yolov5_seg/cpp/rknpu1/postprocess.cc new file mode 100644 index 0000000..2afbbd1 --- /dev/null +++ b/examples/yolov5_seg/cpp/rknpu1/postprocess.cc @@ -0,0 +1,780 @@ +// Copyright (c) 2021 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "yolov5_seg.h" + +#include +#include +#include +#include +#include +#include +#include +#include "easy_timer.h" + +#include +#include +#define LABEL_NALE_TXT_PATH "./model/coco_80_labels_list.txt" +// #define USE_FP_RESIZE + +static char *labels[OBJ_CLASS_NUM]; + +const int anchor[3][6] = {{10, 13, 16, 30, 33, 23}, + {30, 61, 62, 45, 59, 119}, + {116, 90, 156, 198, 373, 326}}; + +int clamp(float val, int min, int max) +{ + return val > min ? (val < max ? val : max) : min; +} + +static char *readLine(FILE *fp, char *buffer, int *len) +{ + int ch; + int i = 0; + size_t buff_len = 0; + + buffer = (char *)malloc(buff_len + 1); + if (!buffer) + return NULL; // Out of memory + + while ((ch = fgetc(fp)) != '\n' && ch != EOF) + { + buff_len++; + void *tmp = realloc(buffer, buff_len + 1); + if (tmp == NULL) + { + free(buffer); + return NULL; // Out of memory + } + buffer = (char *)tmp; + + buffer[i] = (char)ch; + i++; + } + buffer[i] = '\0'; + + *len = buff_len; + + // Detect end + if (ch == EOF && (i == 0 || ferror(fp))) + { + free(buffer); + return NULL; + } + return buffer; +} + +static int readLines(const char *fileName, char *lines[], int max_line) +{ + FILE *file = fopen(fileName, "r"); + char *s; + int i = 0; + int n = 0; + + if (file == NULL) + { + printf("Open %s fail!\n", fileName); + return -1; + } + + while ((s = readLine(file, s, &n)) != NULL) + { + lines[i++] = s; + if (i >= max_line) + break; + } + fclose(file); + return i; +} + +static int loadLabelName(const char *locationFilename, char *label[]) +{ + printf("load lable %s\n", locationFilename); + readLines(locationFilename, label, OBJ_CLASS_NUM); + return 0; +} + +static float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1, + float ymax1) +{ + float w = fmax(0.f, fmin(xmax0, xmax1) - fmax(xmin0, xmin1) + 1.0); + float h = fmax(0.f, fmin(ymax0, ymax1) - fmax(ymin0, ymin1) + 1.0); + float i = w * h; + float u = (xmax0 - xmin0 + 1.0) * (ymax0 - ymin0 + 1.0) + (xmax1 - xmin1 + 1.0) * (ymax1 - ymin1 + 1.0) - i; + return u <= 0.f ? 
0.f : (i / u); +} + +static int nms(int validCount, std::vector &outputLocations, std::vector classIds, std::vector &order, + int filterId, float threshold) +{ + for (int i = 0; i < validCount; ++i) + { + if (order[i] == -1 || classIds[i] != filterId) + { + continue; + } + int n = order[i]; + for (int j = i + 1; j < validCount; ++j) + { + int m = order[j]; + if (m == -1 || classIds[i] != filterId) + { + continue; + } + float xmin0 = outputLocations[n * 4 + 0]; + float ymin0 = outputLocations[n * 4 + 1]; + float xmax0 = outputLocations[n * 4 + 0] + outputLocations[n * 4 + 2]; + float ymax0 = outputLocations[n * 4 + 1] + outputLocations[n * 4 + 3]; + + float xmin1 = outputLocations[m * 4 + 0]; + float ymin1 = outputLocations[m * 4 + 1]; + float xmax1 = outputLocations[m * 4 + 0] + outputLocations[m * 4 + 2]; + float ymax1 = outputLocations[m * 4 + 1] + outputLocations[m * 4 + 3]; + + float iou = CalculateOverlap(xmin0, ymin0, xmax0, ymax0, xmin1, ymin1, xmax1, ymax1); + + if (iou > threshold) + { + order[j] = -1; + } + } + } + return 0; +} + +static int quick_sort_indice_inverse(std::vector &input, int left, int right, std::vector &indices) +{ + float key; + int key_index; + int low = left; + int high = right; + if (left < right) + { + key_index = indices[left]; + key = input[left]; + while (low < high) + { + while (low < high && input[high] <= key) + { + high--; + } + input[low] = input[high]; + indices[low] = indices[high]; + while (low < high && input[low] >= key) + { + low++; + } + input[high] = input[low]; + indices[high] = indices[low]; + } + input[low] = key; + indices[low] = key_index; + quick_sort_indice_inverse(input, left, low - 1, indices); + quick_sort_indice_inverse(input, low + 1, right, indices); + } + return low; +} + +void resize_by_opencv_fp(float *input_image, int input_width, int input_height, int boxes_num, float *output_image, int target_width, int target_height) +{ + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_32F, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(float)); + } +} + +void resize_by_opencv_uint8(uint8_t *input_image, int input_width, int input_height, int boxes_num, uint8_t *output_image, int target_width, int target_height) +{ + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_8U, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(uint8_t)); + } +} + +void crop_mask_fp(float *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) +{ + for (int b = 0; b < boxes_num; b++) + { + float x1 = boxes[b * 4 + 0]; + float y1 = boxes[b * 4 + 1]; + float x2 = boxes[b * 4 + 2]; + float y2 = boxes[b * 4 + 3]; + + for (int i = 0; i < height; i++) + { + for (int j = 0; j < width; j++) + { + if (j >= x1 && j < x2 && i >= y1 && i < y2) + { + if (all_mask_in_one[i * width + j] == 0) + { + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } + } + } + } + } + } +} + +void 
crop_mask_uint8(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) +{ + for (int b = 0; b < boxes_num; b++) + { + float x1 = boxes[b * 4 + 0]; + float y1 = boxes[b * 4 + 1]; + float x2 = boxes[b * 4 + 2]; + float y2 = boxes[b * 4 + 3]; + + for (int i = 0; i < height; i++) + { + for (int j = 0; j < width; j++) + { + if (j >= x1 && j < x2 && i >= y1 && i < y2) + { + if (all_mask_in_one[i * width + j] == 0) + { + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } + } + } + } + } + } +} + +void matmul_by_cpu_fp(std::vector &A, float *B, float *C, int ROWS_A, int COLS_A, int COLS_B) +{ + + float temp = 0; + for (int i = 0; i < ROWS_A; i++) + { + for (int j = 0; j < COLS_B; j++) + { + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + C[i * COLS_B + j] = temp; + } + } +} + +void matmul_by_cpu_uint8(std::vector &A, float *B, uint8_t *C, int ROWS_A, int COLS_A, int COLS_B) +{ + + float temp = 0; + for (int i = 0; i < ROWS_A; i++) + { + for (int j = 0; j < COLS_B; j++) + { + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + if (temp > 0) + { + C[i * COLS_B + j] = 4; + } + else + { + C[i * COLS_B + j] = 0; + } + } + } +} + +void seg_reverse(uint8_t *seg_mask, uint8_t *cropped_seg, uint8_t *seg_mask_real, + int model_in_height, int model_in_width, int cropped_height, int cropped_width, int ori_in_height, int ori_in_width, int y_pad, int x_pad) +{ + + if (y_pad == 0 && x_pad == 0 && ori_in_height == model_in_height && ori_in_width == model_in_width) + { + memcpy(seg_mask_real, seg_mask, ori_in_height * ori_in_width); + return; + } + + int cropped_index = 0; + for (int i = 0; i < model_in_height; i++) + { + for (int j = 0; j < model_in_width; j++) + { + if (i >= y_pad && i < model_in_height - y_pad && j >= x_pad && j < model_in_width - x_pad) + { + int seg_index = i * model_in_width + j; + cropped_seg[cropped_index] = seg_mask[seg_index]; + cropped_index++; + } + } + } + resize_by_opencv_uint8(cropped_seg, cropped_width, cropped_height, 1, seg_mask_real, ori_in_width, ori_in_height); +} + +int box_reverse(int position, int boundary, int pad, float scale) +{ + return (int)((clamp(position, 0, boundary) - pad) / scale); +} + +static float sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); } + +static float unsigmoid(float y) { return -1.0 * logf((1.0 / y) - 1.0); } + +inline static int32_t __clip(float val, float min, float max) +{ + float f = val <= min ? min : (val >= max ? 
max : val); + return f; +} + +static float deqnt_affine_to_f32(uint8_t qnt, uint8_t zp, float scale) +{ + return ((float)qnt - (float)zp) * scale; +} + +static uint8_t qnt_f32_to_affine(float f32, uint8_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + +static int process_u8(rknn_output *all_input, int input_id, int *anchor, int grid_h, int grid_w, int height, int width, int stride, + std::vector &boxes, std::vector &segments, float *proto, std::vector &objProbs, std::vector &classId, float threshold, + rknn_app_context_t *app_ctx) +{ + + int validCount = 0; + int grid_len = grid_h * grid_w; + + if (input_id % 2 == 1) + { + return validCount; + } + + if (input_id == 6) + { + uint8_t *input_proto = (uint8_t *)all_input[input_id].buf; + uint8_t zp_proto = app_ctx->output_attrs[input_id].zp; + float scale_proto = app_ctx->output_attrs[input_id].scale; + for (int i = 0; i < PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT; i++) + { + proto[i] = deqnt_affine_to_f32(input_proto[i], zp_proto, scale_proto); + } + return validCount; + } + + uint8_t *input = (uint8_t *)all_input[input_id].buf; + uint8_t *input_seg = (uint8_t *)all_input[input_id + 1].buf; + uint8_t zp = app_ctx->output_attrs[input_id].zp; + float scale = app_ctx->output_attrs[input_id].scale; + uint8_t zp_seg = app_ctx->output_attrs[input_id + 1].zp; + float scale_seg = app_ctx->output_attrs[input_id + 1].scale; + + uint8_t thres_u8 = qnt_f32_to_affine(threshold, zp, scale); + + for (int a = 0; a < 3; a++) + { + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + uint8_t box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j]; + if (box_confidence >= thres_u8) + { + int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; + int offset_seg = (PROTO_CHANNEL * a) * grid_len + i * grid_w + j; + uint8_t *in_ptr = input + offset; + uint8_t *in_ptr_seg = input_seg + offset_seg; + + float box_x = (deqnt_affine_to_f32(*in_ptr, zp, scale)) * 2.0 - 0.5; + float box_y = (deqnt_affine_to_f32(in_ptr[grid_len], zp, scale)) * 2.0 - 0.5; + float box_w = (deqnt_affine_to_f32(in_ptr[2 * grid_len], zp, scale)) * 2.0; + float box_h = (deqnt_affine_to_f32(in_ptr[3 * grid_len], zp, scale)) * 2.0; + box_x = (box_x + j) * (float)stride; + box_y = (box_y + i) * (float)stride; + box_w = box_w * box_w * (float)anchor[a * 2]; + box_h = box_h * box_h * (float)anchor[a * 2 + 1]; + box_x -= (box_w / 2.0); + box_y -= (box_h / 2.0); + + uint8_t maxClassProbs = in_ptr[5 * grid_len]; + int maxClassId = 0; + for (int k = 1; k < OBJ_CLASS_NUM; ++k) + { + uint8_t prob = in_ptr[(5 + k) * grid_len]; + if (prob > maxClassProbs) + { + maxClassId = k; + maxClassProbs = prob; + } + } + + float box_conf_f32 = deqnt_affine_to_f32(box_confidence, zp, scale); + float class_prob_f32 = deqnt_affine_to_f32(maxClassProbs, zp, scale); + float limit_score = box_conf_f32 * class_prob_f32; + // if (maxClassProbs > thres_u8) + if (limit_score > threshold) + { + for (int k = 0; k < PROTO_CHANNEL; k++) + { + float seg_element_fp = deqnt_affine_to_f32(in_ptr_seg[(k)*grid_len], zp_seg, scale_seg); + segments.push_back(seg_element_fp); + } + + objProbs.push_back((deqnt_affine_to_f32(maxClassProbs, zp, scale)) * (deqnt_affine_to_f32(box_confidence, zp, scale))); + classId.push_back(maxClassId); + validCount++; + boxes.push_back(box_x); + boxes.push_back(box_y); + boxes.push_back(box_w); + boxes.push_back(box_h); + } + } + } + } + } + return validCount; +} + +static int 
process_fp32(rknn_output *all_input, int input_id, int *anchor, int grid_h, int grid_w, int height, int width, int stride, + std::vector &boxes, std::vector &segments, float *proto, std::vector &objProbs, std::vector &classId, float threshold) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + + if (input_id % 2 == 1) + { + return validCount; + } + + if (input_id == 6) + { + float *input_proto = (float *)all_input[input_id].buf; + for (int i = 0; i < PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT; i++) + { + proto[i] = input_proto[i]; + } + return validCount; + } + + float *input = (float *)all_input[input_id].buf; + float *input_seg = (float *)all_input[input_id + 1].buf; + + for (int a = 0; a < 3; a++) + { + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + float box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j]; + if (box_confidence >= threshold) + { + int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; + int offset_seg = (PROTO_CHANNEL * a) * grid_len + i * grid_w + j; + float *in_ptr = input + offset; + float *in_ptr_seg = input_seg + offset_seg; + + float box_x = *in_ptr * 2.0 - 0.5; + float box_y = in_ptr[grid_len] * 2.0 - 0.5; + float box_w = in_ptr[2 * grid_len] * 2.0; + float box_h = in_ptr[3 * grid_len] * 2.0; + box_x = (box_x + j) * (float)stride; + box_y = (box_y + i) * (float)stride; + box_w = box_w * box_w * (float)anchor[a * 2]; + box_h = box_h * box_h * (float)anchor[a * 2 + 1]; + box_x -= (box_w / 2.0); + box_y -= (box_h / 2.0); + + float maxClassProbs = in_ptr[5 * grid_len]; + int maxClassId = 0; + for (int k = 1; k < OBJ_CLASS_NUM; ++k) + { + float prob = in_ptr[(5 + k) * grid_len]; + if (prob > maxClassProbs) + { + maxClassId = k; + maxClassProbs = prob; + } + } + float limit_score = maxClassProbs * box_confidence; + // if (maxClassProbs > threshold) + if (limit_score > threshold) + { + for (int k = 0; k < PROTO_CHANNEL; k++) + { + float seg_element_f32 = in_ptr_seg[(k)*grid_len]; + segments.push_back(seg_element_f32); + } + + objProbs.push_back(maxClassProbs * box_confidence); + classId.push_back(maxClassId); + validCount++; + boxes.push_back(box_x); + boxes.push_back(box_y); + boxes.push_back(box_w); + boxes.push_back(box_h); + } + } + } + } + } + return validCount; +} + +int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) +{ + + std::vector filterBoxes; + std::vector objProbs; + std::vector classId; + + std::vector filterSegments; + float proto[PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT]; + std::vector filterSegments_by_nms; + + int model_in_width = app_ctx->model_width; + int model_in_height = app_ctx->model_height; + + int validCount = 0; + int stride = 0; + int grid_h = 0; + int grid_w = 0; + + memset(od_results, 0, sizeof(object_detect_result_list)); + + // process the outputs of rknn + for (int i = 0; i < 7; i++) + { + + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[0]; + stride = model_in_height / grid_h; + if (app_ctx->is_quant) + { + validCount += process_u8(outputs, i, (int *)anchor[i / 2], grid_h, grid_w, model_in_height, model_in_width, stride, filterBoxes, filterSegments, proto, objProbs, + classId, conf_threshold, app_ctx); + } + else + { + validCount += process_fp32(outputs, i, (int *)anchor[i / 2], grid_h, grid_w, model_in_height, model_in_width, stride, filterBoxes, filterSegments, proto, objProbs, + classId, 
conf_threshold); + } + } + + // nms + if (validCount <= 0) + { + return 0; + } + std::vector indexArray; + for (int i = 0; i < validCount; ++i) + { + indexArray.push_back(i); + } + + quick_sort_indice_inverse(objProbs, 0, validCount - 1, indexArray); + + std::set class_set(std::begin(classId), std::end(classId)); + + for (auto c : class_set) + { + nms(validCount, filterBoxes, classId, indexArray, c, nms_threshold); + } + + int last_count = 0; + od_results->count = 0; + + for (int i = 0; i < validCount; ++i) + { + if (indexArray[i] == -1 || last_count >= OBJ_NUMB_MAX_SIZE) + { + continue; + } + int n = indexArray[i]; + + float x1 = filterBoxes[n * 4 + 0]; + float y1 = filterBoxes[n * 4 + 1]; + float x2 = x1 + filterBoxes[n * 4 + 2]; + float y2 = y1 + filterBoxes[n * 4 + 3]; + int id = classId[n]; + float obj_conf = objProbs[i]; + + for (int k = 0; k < PROTO_CHANNEL; k++) + { + filterSegments_by_nms.push_back(filterSegments[n * PROTO_CHANNEL + k]); + } + + od_results->results[last_count].box.left = x1; + od_results->results[last_count].box.top = y1; + od_results->results[last_count].box.right = x2; + od_results->results[last_count].box.bottom = y2; + + od_results->results[last_count].prop = obj_conf; + od_results->results[last_count].cls_id = id; + last_count++; + } + od_results->count = last_count; + int boxes_num = od_results->count; + + float filterBoxes_by_nms[boxes_num * 4]; + int cls_id[boxes_num]; + for (int i = 0; i < boxes_num; i++) + { + // for crop_mask + filterBoxes_by_nms[i * 4 + 0] = od_results->results[i].box.left; // x1; + filterBoxes_by_nms[i * 4 + 1] = od_results->results[i].box.top; // y1; + filterBoxes_by_nms[i * 4 + 2] = od_results->results[i].box.right; // x2; + filterBoxes_by_nms[i * 4 + 3] = od_results->results[i].box.bottom; // y2; + cls_id[i] = od_results->results[i].cls_id; + + // get real box + od_results->results[i].box.left = box_reverse(od_results->results[i].box.left, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.top = box_reverse(od_results->results[i].box.top, model_in_height, letter_box->y_pad, letter_box->scale); + od_results->results[i].box.right = box_reverse(od_results->results[i].box.right, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.bottom = box_reverse(od_results->results[i].box.bottom, model_in_height, letter_box->y_pad, letter_box->scale); + } + + TIMER timer; +#ifdef USE_FP_RESIZE + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + float *matmul_out = (float *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(float)); + matmul_by_cpu_fp(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + timer.tok(); + timer.print_time("matmul_by_cpu_fp"); + + timer.tik(); + // resize to (boxes_num, model_in_width, model_in_height) + float *seg_mask = (float *)malloc(boxes_num * model_in_height * model_in_width * sizeof(float)); + resize_by_opencv_fp(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_fp"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_fp(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + 
timer.print_time("crop_mask_fp"); +#else + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + uint8_t *matmul_out = (uint8_t *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(uint8_t)); + matmul_by_cpu_uint8(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + timer.tok(); + timer.print_time("matmul_by_cpu_uint8"); + + timer.tik(); + uint8_t *seg_mask = (uint8_t *)malloc(boxes_num * model_in_height * model_in_width * sizeof(uint8_t)); + resize_by_opencv_uint8(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_uint8"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_uint8(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + timer.print_time("crop_mask_uint8"); +#endif + + timer.tik(); + // get real mask + int cropped_height = model_in_height - letter_box->y_pad * 2; + int cropped_width = model_in_width - letter_box->x_pad * 2; + int ori_in_height = app_ctx->input_image_height; + int ori_in_width = app_ctx->input_image_width; + int y_pad = letter_box->y_pad; + int x_pad = letter_box->x_pad; + uint8_t *cropped_seg_mask = (uint8_t *)malloc(cropped_height * cropped_width * sizeof(uint8_t)); + uint8_t *real_seg_mask = (uint8_t *)malloc(ori_in_height * ori_in_width * sizeof(uint8_t)); + seg_reverse(all_mask_in_one, cropped_seg_mask, real_seg_mask, + model_in_height, model_in_width, cropped_height, cropped_width, ori_in_height, ori_in_width, y_pad, x_pad); + od_results->results_seg[0].seg_mask = real_seg_mask; + free(all_mask_in_one); + free(cropped_seg_mask); + free(seg_mask); + free(matmul_out); + timer.tok(); + timer.print_time("seg_reverse"); + + return 0; +} + +int init_post_process() +{ + int ret = 0; + ret = loadLabelName(LABEL_NALE_TXT_PATH, labels); + if (ret < 0) + { + printf("Load %s failed!\n", LABEL_NALE_TXT_PATH); + return -1; + } + return 0; +} + +char *coco_cls_to_name(int cls_id) +{ + + if (cls_id >= OBJ_CLASS_NUM) + { + return "null"; + } + + if (labels[cls_id]) + { + return labels[cls_id]; + } + + return "null"; +} + +void deinit_post_process() +{ + for (int i = 0; i < OBJ_CLASS_NUM; i++) + { + { + free(labels[i]); + labels[i] = nullptr; + } + } +} diff --git a/examples/yolov5_seg/cpp/rknpu1/yolov5_seg.cc b/examples/yolov5_seg/cpp/rknpu1/yolov5_seg.cc new file mode 100644 index 0000000..53f3863 --- /dev/null +++ b/examples/yolov5_seg/cpp/rknpu1/yolov5_seg.cc @@ -0,0 +1,238 @@ +#include +#include +#include +#include + +#include "yolov5_seg.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov5_seg_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load 
RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov5_seg_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov5_seg_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; + const float box_conf_threshold = BOX_THRESH; + int bg_color = 114; // pad 
color for letterbox + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + app_ctx->input_image_width = img->width; + app_ctx->input_image_height = img->height; + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + goto out; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + goto out; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + goto out; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolov5_seg/cpp/postprocess.cc b/examples/yolov5_seg/cpp/rknpu2/postprocess.cc similarity index 70% rename from examples/yolov5_seg/cpp/postprocess.cc rename to examples/yolov5_seg/cpp/rknpu2/postprocess.cc index 1af0538..ada0bd8 100644 --- a/examples/yolov5_seg/cpp/postprocess.cc +++ b/examples/yolov5_seg/cpp/rknpu2/postprocess.cc @@ -23,13 +23,15 @@ #include #include "rknn_matmul_api.h" #include "im2d.hpp" -#include "dma_alloc.cpp" -#include "drm_alloc.cpp" +#include "dma_alloc.hpp" +#include "drm_alloc.hpp" #include "Float16.h" +#include "easy_timer.h" #include #include #define LABEL_NALE_TXT_PATH "./model/coco_80_labels_list.txt" +// #define USE_FP_RESIZE static char *labels[OBJ_CLASS_NUM]; @@ -190,12 +192,26 @@ static int quick_sort_indice_inverse(std::vector &input, int left, int ri return low; } -void resize_by_opencv(uint8_t *input_image, int input_width, int input_height, uint8_t *output_image, int target_width, int target_height) +void resize_by_opencv_fp(float *input_image, int input_width, int input_height, int boxes_num, float *output_image, int target_width, int target_height) { - cv::Mat src_image(input_height, input_width, CV_8U, input_image); - cv::Mat dst_image; - cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); - memcpy(output_image, dst_image.data, target_width * target_height); + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_32F, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(float)); + } +} + +void resize_by_opencv_uint8(uint8_t *input_image, int input_width, int input_height, int boxes_num, uint8_t *output_image, int target_width, int target_height) +{ + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_8U, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(uint8_t)); + } } void resize_by_rga_rk3588(uint8_t *input_image, int input_width, int input_height, uint8_t *output_image, int target_width, int target_height) @@ -230,7 +246,15 @@ void resize_by_rga_rk3588(uint8_t *input_image, int input_width, int input_heigh dst = wrapbuffer_handle(dst_handle, dst_width, dst_height, dst_format); src = wrapbuffer_handle(src_handle, src_width, src_height, src_format); - imresize(src, dst); + int ret = imresize(src, dst); + if (ret == IM_STATUS_SUCCESS) + { + printf("%s running success!\n", "rga_resize"); + } + else + { + printf("%s running failed, %s\n", "rga_resize", imStrError((IM_STATUS)ret)); + } memcpy(output_image, dst_buf, target_width * target_height); @@ -286,7 +310,15 @@ void resize_by_rga_rk356x(uint8_t *input_image, int input_width, int input_heigh dst = wrapbuffer_handle(dst_handle, 
dst_width, dst_height, dst_format); src = wrapbuffer_handle(src_handle, src_width, src_height, src_format); - imresize(src, dst); + int ret = imresize(src, dst); + if (ret == IM_STATUS_SUCCESS) + { + printf("%s running success!\n", "rga_resize"); + } + else + { + printf("%s running failed, %s\n", "rga_resize", imStrError((IM_STATUS)ret)); + } memcpy(output_image, drm_dst.drm_buf, target_width * target_height); @@ -296,7 +328,7 @@ void resize_by_rga_rk356x(uint8_t *input_image, int input_width, int input_heigh drm_buf_destroy(drm_dst.drm_buffer_fd, drm_dst.drm_buffer_handle, drm_dst.drm_buf, drm_dst.actual_size); } -void crop_mask(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) +void crop_mask_fp(float *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) { for (int b = 0; b < boxes_num; b++) { @@ -313,7 +345,14 @@ void crop_mask(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int bo { if (all_mask_in_one[i * width + j] == 0) { - all_mask_in_one[i * width + j] = seg_mask[b * width * height + i * width + j] * (cls_id[b] + 1); + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } } } } @@ -321,83 +360,82 @@ void crop_mask(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int bo } } -void matmul_by_npu_i8(std::vector &A_input, float *B_input, uint8_t *C_input, int ROWS_A, int COLS_A, int COLS_B, rknn_app_context_t *app_ctx) +void crop_mask_uint8(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) { - int B_layout = 0; - int AC_layout = 0; - int32_t M = 1; - int32_t K = COLS_A; - int32_t N = COLS_B; - - rknn_matmul_ctx ctx; - rknn_matmul_info info; - memset(&info, 0, sizeof(rknn_matmul_info)); - info.M = M; - info.K = K; - info.N = N; - info.type = RKNN_INT8_MM_INT8_TO_INT32; - info.B_layout = B_layout; - info.AC_layout = AC_layout; - - rknn_matmul_io_attr io_attr; - memset(&io_attr, 0, sizeof(rknn_matmul_io_attr)); - - int8_t int8Vector_A[ROWS_A * COLS_A]; - for (int i = 0; i < ROWS_A * COLS_A; ++i) + for (int b = 0; b < boxes_num; b++) { - int8Vector_A[i] = (int8_t)A_input[i]; - } + float x1 = boxes[b * 4 + 0]; + float y1 = boxes[b * 4 + 1]; + float x2 = boxes[b * 4 + 2]; + float y2 = boxes[b * 4 + 3]; - int8_t int8Vector_B[COLS_A * COLS_B]; - for (int i = 0; i < COLS_A * COLS_B; ++i) - { - int8Vector_B[i] = (int8_t)B_input[i]; + for (int i = 0; i < height; i++) + { + for (int j = 0; j < width; j++) + { + if (j >= x1 && j < x2 && i >= y1 && i < y2) + { + if (all_mask_in_one[i * width + j] == 0) + { + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } + } + } + } + } } +} - int ret = rknn_matmul_create(&ctx, &info, &io_attr); - // Create A - rknn_tensor_mem *A = rknn_create_mem(ctx, io_attr.A.size); - // Create B - rknn_tensor_mem *B = rknn_create_mem(ctx, io_attr.B.size); - // Create C - rknn_tensor_mem *C = rknn_create_mem(ctx, io_attr.C.size); - - memcpy(B->virt_addr, int8Vector_B, B->size); - // Set A - ret = rknn_matmul_set_io_mem(ctx, A, &io_attr.A); - // Set B - ret = rknn_matmul_set_io_mem(ctx, B, &io_attr.B); - // Set C - ret = rknn_matmul_set_io_mem(ctx, C, &io_attr.C); +void matmul_by_cpu_fp(std::vector &A, float *B, float *C, int ROWS_A, int COLS_A, int COLS_B) +{ - for 
(int i = 0; i < ROWS_A; ++i) + float temp = 0; + for (int i = 0; i < ROWS_A; i++) { - memcpy(A->virt_addr, int8Vector_A + i * A->size, A->size); + for (int j = 0; j < COLS_B; j++) + { + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + C[i * COLS_B + j] = temp; + } + } +} - // Run - ret = rknn_matmul_run(ctx); +void matmul_by_cpu_uint8(std::vector &A, float *B, uint8_t *C, int ROWS_A, int COLS_A, int COLS_B) +{ - for (int j = 0; j < COLS_B; ++j) + float temp = 0; + for (int i = 0; i < ROWS_A; i++) + { + for (int j = 0; j < COLS_B; j++) { - if (((int32_t *)C->virt_addr)[j] > 0) + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + if (temp > 0) { - C_input[i * COLS_B + j] = 1; + C[i * COLS_B + j] = 4; } else { - C_input[i * COLS_B + j] = 0; + C[i * COLS_B + j] = 0; } } } - - // destroy - rknn_destroy_mem(ctx, A); - rknn_destroy_mem(ctx, B); - rknn_destroy_mem(ctx, C); - rknn_matmul_destroy(ctx); } -void matmul_by_npu_fp16(std::vector &A_input, float *B_input, uint8_t *C_input, int ROWS_A, int COLS_A, int COLS_B, rknn_app_context_t *app_ctx) +void matmul_by_npu_fp(std::vector &A_input, float *B_input, float *C_input, int ROWS_A, int COLS_A, int COLS_B, rknn_app_context_t *app_ctx) { int B_layout = 0; int AC_layout = 0; @@ -452,14 +490,7 @@ void matmul_by_npu_fp16(std::vector &A_input, float *B_input, uint8_t *C_ ret = rknn_matmul_run(ctx); for (int i = 0; i < ROWS_A * COLS_B; ++i) { - if (((float *)C->virt_addr)[i] > 0) - { - C_input[i] = 1; - } - else - { - C_input[i] = 0; - } + C_input[i] = ((float *)C->virt_addr)[i]; } // destroy @@ -470,24 +501,31 @@ void matmul_by_npu_fp16(std::vector &A_input, float *B_input, uint8_t *C_ } void seg_reverse(uint8_t *seg_mask, uint8_t *cropped_seg, uint8_t *seg_mask_real, - int model_in_height, int model_in_width, int proto_height, int proto_width, int cropped_height, int cropped_width, int ori_in_height, int ori_in_width, int y_pad, int x_pad) + int model_in_height, int model_in_width, int cropped_height, int cropped_width, int ori_in_height, int ori_in_width, int y_pad, int x_pad) { + + if (y_pad == 0 && x_pad == 0 && ori_in_height == model_in_height && ori_in_width == model_in_width) + { + memcpy(seg_mask_real, seg_mask, ori_in_height * ori_in_width); + return; + } + int cropped_index = 0; - for (int i = 0; i < proto_height; i++) + for (int i = 0; i < model_in_height; i++) { - for (int j = 0; j < proto_width; j++) + for (int j = 0; j < model_in_width; j++) { - if (i >= y_pad && i < proto_height - y_pad && j >= x_pad && j < proto_width - x_pad) + if (i >= y_pad && i < model_in_height - y_pad && j >= x_pad && j < model_in_width - x_pad) { - int seg_index = i * proto_width + j; + int seg_index = i * model_in_width + j; cropped_seg[cropped_index] = seg_mask[seg_index]; cropped_index++; } } } - - // Note: Here are different methods provided for implementing single-channel image scaling - resize_by_opencv(cropped_seg, cropped_width, cropped_height, seg_mask_real, ori_in_width, ori_in_height); + // Note: Here are different methods provided for implementing single-channel image scaling. + // The method of using rga to resize the image requires that the image size is 2 aligned. 
+ resize_by_opencv_uint8(cropped_seg, cropped_width, cropped_height, 1, seg_mask_real, ori_in_width, ori_in_height); // resize_by_rga_rk356x(cropped_seg, cropped_width, cropped_height, seg_mask_real, ori_in_width, ori_in_height); // resize_by_rga_rk3588(cropped_seg, cropped_width, cropped_height, seg_mask_real, ori_in_width, ori_in_height); } @@ -533,9 +571,10 @@ static int process_i8(rknn_output *all_input, int input_id, int *anchor, int gri { int8_t *input_proto = (int8_t *)all_input[input_id].buf; int32_t zp_proto = app_ctx->output_attrs[input_id].zp; - for (int i = 0; i < 32 * grid_len; i++) + float scale_proto = app_ctx->output_attrs[input_id].scale; + for (int i = 0; i < PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT; i++) { - proto[i] = input_proto[i] - zp_proto; + proto[i] = deqnt_affine_to_f32(input_proto[i], zp_proto, scale_proto); } return validCount; } @@ -559,7 +598,7 @@ static int process_i8(rknn_output *all_input, int input_id, int *anchor, int gri if (box_confidence >= thres_i8) { int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; - int offset_seg = (32 * a) * grid_len + i * grid_w + j; + int offset_seg = (PROTO_CHANNEL * a) * grid_len + i * grid_w + j; int8_t *in_ptr = input + offset; int8_t *in_ptr_seg = input_seg + offset_seg; @@ -589,14 +628,13 @@ static int process_i8(rknn_output *all_input, int input_id, int *anchor, int gri float box_conf_f32 = deqnt_affine_to_f32(box_confidence, zp, scale); float class_prob_f32 = deqnt_affine_to_f32(maxClassProbs, zp, scale); float limit_score = box_conf_f32 * class_prob_f32; - if (maxClassProbs > thres_i8) - // if (limit_score > threshold) + // if (maxClassProbs > thres_i8) + if (limit_score > threshold) { - - for (int k = 0; k < 32; k++) + for (int k = 0; k < PROTO_CHANNEL; k++) { - int8_t seg_element_i8 = in_ptr_seg[(k)*grid_len] - zp_seg; - segments.push_back(seg_element_i8); + float seg_element_fp = deqnt_affine_to_f32(in_ptr_seg[(k)*grid_len], zp_seg, scale_seg); + segments.push_back(seg_element_fp); } objProbs.push_back((deqnt_affine_to_f32(maxClassProbs, zp, scale)) * (deqnt_affine_to_f32(box_confidence, zp, scale))); @@ -628,7 +666,7 @@ static int process_fp32(rknn_output *all_input, int input_id, int *anchor, int g if (input_id == 6) { float *input_proto = (float *)all_input[input_id].buf; - for (int i = 0; i < 32 * grid_len; i++) + for (int i = 0; i < PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT; i++) { proto[i] = input_proto[i]; } @@ -648,7 +686,7 @@ static int process_fp32(rknn_output *all_input, int input_id, int *anchor, int g if (box_confidence >= threshold) { int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; - int offset_seg = (32 * a) * grid_len + i * grid_w + j; + int offset_seg = (PROTO_CHANNEL * a) * grid_len + i * grid_w + j; float *in_ptr = input + offset; float *in_ptr_seg = input_seg + offset_seg; @@ -674,9 +712,11 @@ static int process_fp32(rknn_output *all_input, int input_id, int *anchor, int g maxClassProbs = prob; } } - if (maxClassProbs > threshold) + float limit_score = maxClassProbs * box_confidence; + // if (maxClassProbs > threshold) + if (limit_score > threshold) { - for (int k = 0; k < 32; k++) + for (int k = 0; k < PROTO_CHANNEL; k++) { float seg_element_f32 = in_ptr_seg[(k)*grid_len]; segments.push_back(seg_element_f32); @@ -699,17 +739,16 @@ static int process_fp32(rknn_output *all_input, int input_id, int *anchor, int g int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, 
object_detect_result_list *od_results) { - std::vector filterBoxes; std::vector objProbs; std::vector classId; std::vector filterSegments; - float proto[32 * 160 * 160]; + float proto[PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT]; std::vector filterSegments_by_nms; - int model_in_w = app_ctx->model_width; - int model_in_h = app_ctx->model_height; + int model_in_width = app_ctx->model_width; + int model_in_height = app_ctx->model_height; int validCount = 0; int stride = 0; @@ -723,16 +762,16 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t { grid_h = app_ctx->output_attrs[i].dims[2]; grid_w = app_ctx->output_attrs[i].dims[3]; - stride = model_in_h / grid_h; + stride = model_in_height / grid_h; if (app_ctx->is_quant) { - validCount += process_i8(outputs, i, (int *)anchor[i / 2], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, filterSegments, proto, objProbs, + validCount += process_i8(outputs, i, (int *)anchor[i / 2], grid_h, grid_w, model_in_height, model_in_width, stride, filterBoxes, filterSegments, proto, objProbs, classId, conf_threshold, app_ctx); } else { - validCount += process_fp32(outputs, i, (int *)anchor[i / 2], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, filterSegments, proto, objProbs, + validCount += process_fp32(outputs, i, (int *)anchor[i / 2], grid_h, grid_w, model_in_height, model_in_width, stride, filterBoxes, filterSegments, proto, objProbs, classId, conf_threshold); } } @@ -775,9 +814,9 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t int id = classId[n]; float obj_conf = objProbs[i]; - for (int k = 0; k < 32; k++) + for (int k = 0; k < PROTO_CHANNEL; k++) { - filterSegments_by_nms.push_back(filterSegments[n * 32 + k]); + filterSegments_by_nms.push_back(filterSegments[n * PROTO_CHANNEL + k]); } od_results->results[last_count].box.left = x1; @@ -790,60 +829,99 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t last_count++; } od_results->count = last_count; - int boxes_num = od_results->count; - // compute the mask (binary matrix) through Matmul - int ROWS_A = boxes_num; - int COLS_A = 32; - int COLS_B = 160 * 160; - uint8_t matmul_out[boxes_num * 160 * 160]; - if (app_ctx->is_quant) - { - matmul_by_npu_i8(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B, app_ctx); - } - else - { - matmul_by_npu_fp16(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B, app_ctx); - } - float filterBoxes_by_nms[boxes_num * 4]; int cls_id[boxes_num]; for (int i = 0; i < boxes_num; i++) { // for crop_mask - filterBoxes_by_nms[i * 4 + 0] = od_results->results[i].box.left / 4.0; // x1; - filterBoxes_by_nms[i * 4 + 1] = od_results->results[i].box.top / 4.0; // y1; - filterBoxes_by_nms[i * 4 + 2] = od_results->results[i].box.right / 4.0; // x2; - filterBoxes_by_nms[i * 4 + 3] = od_results->results[i].box.bottom / 4.0; // y2; + filterBoxes_by_nms[i * 4 + 0] = od_results->results[i].box.left; // x1; + filterBoxes_by_nms[i * 4 + 1] = od_results->results[i].box.top; // y1; + filterBoxes_by_nms[i * 4 + 2] = od_results->results[i].box.right; // x2; + filterBoxes_by_nms[i * 4 + 3] = od_results->results[i].box.bottom; // y2; cls_id[i] = od_results->results[i].cls_id; // get real box - od_results->results[i].box.left = box_reverse(od_results->results[i].box.left, model_in_w, letter_box->x_pad, letter_box->scale); - od_results->results[i].box.top = box_reverse(od_results->results[i].box.top, model_in_h, letter_box->y_pad, letter_box->scale); - 
od_results->results[i].box.right = box_reverse(od_results->results[i].box.right, model_in_w, letter_box->x_pad, letter_box->scale); - od_results->results[i].box.bottom = box_reverse(od_results->results[i].box.bottom, model_in_h, letter_box->y_pad, letter_box->scale); + od_results->results[i].box.left = box_reverse(od_results->results[i].box.left, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.top = box_reverse(od_results->results[i].box.top, model_in_height, letter_box->y_pad, letter_box->scale); + od_results->results[i].box.right = box_reverse(od_results->results[i].box.right, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.bottom = box_reverse(od_results->results[i].box.bottom, model_in_height, letter_box->y_pad, letter_box->scale); } - // crop seg outside box - int proto_height = 160; - int proto_width = 160; - uint8_t all_mask_in_one[160 * 160] = {0}; - crop_mask(matmul_out, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, proto_height, proto_width); - + TIMER timer; +#ifdef USE_FP_RESIZE + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + float *matmul_out = (float *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(float)); + matmul_by_cpu_fp(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + // matmul_by_npu_fp(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B, app_ctx); + timer.tok(); + timer.print_time("matmul_by_cpu_fp"); + + timer.tik(); + // resize to (boxes_num, model_in_width, model_in_height) + float *seg_mask = (float *)malloc(boxes_num * model_in_height * model_in_width * sizeof(float)); + resize_by_opencv_fp(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_fp"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_fp(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + timer.print_time("crop_mask_fp"); +#else + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + uint8_t *matmul_out = (uint8_t *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(uint8_t)); + matmul_by_cpu_uint8(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + + timer.tok(); + timer.print_time("matmul_by_cpu_uint8"); + + timer.tik(); + uint8_t *seg_mask = (uint8_t *)malloc(boxes_num * model_in_height * model_in_width * sizeof(uint8_t)); + resize_by_opencv_uint8(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_uint8"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_uint8(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + timer.print_time("crop_mask_uint8"); +#endif + + timer.tik(); // get real mask - int cropped_height = proto_height - letter_box->y_pad / 4 * 2; - int cropped_width = proto_width - 
letter_box->x_pad / 4 * 2; - int y_pad = letter_box->y_pad / 4; - int x_pad = letter_box->x_pad / 4; - int ori_in_height = (model_in_h - letter_box->y_pad * 2) / letter_box->scale; - int ori_in_width = (model_in_w - letter_box->x_pad * 2) / letter_box->scale; + int cropped_height = model_in_height - letter_box->y_pad * 2; + int cropped_width = model_in_width - letter_box->x_pad * 2; + int ori_in_height = app_ctx->input_image_height; + int ori_in_width = app_ctx->input_image_width; + int y_pad = letter_box->y_pad; + int x_pad = letter_box->x_pad; uint8_t *cropped_seg_mask = (uint8_t *)malloc(cropped_height * cropped_width * sizeof(uint8_t)); uint8_t *real_seg_mask = (uint8_t *)malloc(ori_in_height * ori_in_width * sizeof(uint8_t)); seg_reverse(all_mask_in_one, cropped_seg_mask, real_seg_mask, - model_in_h, model_in_w, proto_height, proto_width, cropped_height, cropped_width, ori_in_height, ori_in_width, y_pad, x_pad); + model_in_height, model_in_width, cropped_height, cropped_width, ori_in_height, ori_in_width, y_pad, x_pad); od_results->results_seg[0].seg_mask = real_seg_mask; + free(all_mask_in_one); free(cropped_seg_mask); + free(seg_mask); + free(matmul_out); + timer.tok(); + timer.print_time("seg_reverse"); return 0; } diff --git a/examples/yolov5_seg/cpp/rknpu2/yolov5_seg.cc b/examples/yolov5_seg/cpp/rknpu2/yolov5_seg.cc index 9ffafd3..81b63cc 100644 --- a/examples/yolov5_seg/cpp/rknpu2/yolov5_seg.cc +++ b/examples/yolov5_seg/cpp/rknpu2/yolov5_seg.cc @@ -86,7 +86,7 @@ int init_yolov5_seg_model(const char *model_path, rknn_app_context_t *app_ctx) // Set to context app_ctx->rknn_ctx = ctx; - if (output_attrs[0].type == RKNN_TENSOR_INT8) + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16) { app_ctx->is_quant = true; } @@ -123,11 +123,6 @@ int init_yolov5_seg_model(const char *model_path, rknn_app_context_t *app_ctx) int release_yolov5_seg_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -138,6 +133,11 @@ int release_yolov5_seg_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } @@ -164,6 +164,8 @@ int inference_yolov5_seg_model(rknn_app_context_t *app_ctx, image_buffer_t *img, memset(outputs, 0, sizeof(outputs)); // Pre Process + app_ctx->input_image_width = img->width; + app_ctx->input_image_height = img->height; dst_img.width = app_ctx->model_width; dst_img.height = app_ctx->model_height; dst_img.format = IMAGE_FORMAT_RGB888; diff --git a/examples/yolov5_seg/cpp/yolov5_seg.h b/examples/yolov5_seg/cpp/yolov5_seg.h index 7d5efdb..6d23327 100644 --- a/examples/yolov5_seg/cpp/yolov5_seg.h +++ b/examples/yolov5_seg/cpp/yolov5_seg.h @@ -27,6 +27,8 @@ typedef struct { int model_channel; int model_width; int model_height; + int input_image_width; + int input_image_height; bool is_quant; } rknn_app_context_t; diff --git a/examples/yolov5_seg/model_comparison/yolov5_seg_graph_comparison.jpg b/examples/yolov5_seg/model_comparison/yolov5_seg_graph_comparison.jpg new file mode 100644 index 0000000..5062b74 Binary files /dev/null and b/examples/yolov5_seg/model_comparison/yolov5_seg_graph_comparison.jpg differ diff --git a/examples/yolov5_seg/model_comparison/yolov5_seg_output_comparison.jpg 
b/examples/yolov5_seg/model_comparison/yolov5_seg_output_comparison.jpg new file mode 100644 index 0000000..be13bb6 Binary files /dev/null and b/examples/yolov5_seg/model_comparison/yolov5_seg_output_comparison.jpg differ diff --git a/examples/yolov5_seg/python/convert.py b/examples/yolov5_seg/python/convert.py index 9917750..22deb40 100644 --- a/examples/yolov5_seg/python/convert.py +++ b/examples/yolov5_seg/python/convert.py @@ -1,52 +1,53 @@ -import os import sys -import numpy as np from rknn.api import RKNN DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt' +DEFAULT_RKNN_PATH = '../model/yolov5_seg.rknn' +DEFAULT_QUANT = True -if __name__ == '__main__': - +def parse_arg(): if len(sys.argv) < 3: - print( - "Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); + print(" platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] platform = sys.argv[2] + do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False - else: - do_quant = True if len(sys.argv) > 4: output_path = sys.argv[4] else: - output_path = model_path.replace('.onnx', '.rknn') + output_path = DEFAULT_RKNN_PATH + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() # Create RKNN object rknn = RKNN(verbose=False) # Pre-process config print('--> Config model') - rknn.config(mean_values=[[0, 0, 0]], std_values=[ - [255, 255, 255]], target_platform=platform) + rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]], target_platform=platform) print('done') # Load model print('--> Loading model') ret = rknn.load_onnx(model=model_path) - # ret = rknn.load_pytorch(model=model_path, input_size_list=[[1, 3, 640, 640]]) if ret != 0: print('Load model failed!') exit(ret) @@ -66,7 +67,6 @@ if ret != 0: print('Export rknn model failed!') exit(ret) - print('--> The RKNN model saved in: {}'.format(output_path)) print('done') # Release diff --git a/examples/yolov5_seg/reference_results/yolov5s_seg_c_demo_result.png b/examples/yolov5_seg/reference_results/yolov5s_seg_c_demo_result.png index ab04135..ed22c09 100644 Binary files a/examples/yolov5_seg/reference_results/yolov5s_seg_c_demo_result.png and b/examples/yolov5_seg/reference_results/yolov5s_seg_c_demo_result.png differ diff --git a/examples/yolov5_seg/reference_results/yolov5s_seg_python_demo_result.jpg b/examples/yolov5_seg/reference_results/yolov5s_seg_python_demo_result.jpg deleted file mode 100644 index d4ba1c1..0000000 Binary files a/examples/yolov5_seg/reference_results/yolov5s_seg_python_demo_result.jpg and /dev/null differ diff --git a/examples/yolov5_seg/reference_results/yolov5s_seg_python_demo_result.png b/examples/yolov5_seg/reference_results/yolov5s_seg_python_demo_result.png new file mode 100644 index 0000000..d6aa868 Binary files /dev/null 
and b/examples/yolov5_seg/reference_results/yolov5s_seg_python_demo_result.png differ
diff --git a/examples/yolov6/README.md b/examples/yolov6/README.md
index 51347c9..5c28d1e 100644
--- a/examples/yolov6/README.md
+++ b/examples/yolov6/README.md
@@ -31,7 +31,7 @@ https://github.com/airockchip/yolov6
 
 ## 2. Current Support Platform
 
-RK3566, RK3568, RK3588, RK3562
+RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126
 
 
@@ -48,6 +48,18 @@ cd model
 ./download_model.sh
 ```
 
+**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolov6n.onnx as an example to show the difference between them.
+1. The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. As shown in the figure, the single output of the original model is split into three groups. For example, in the set of outputs ([1,4,80,80],[1,80,80,80],[1,1,80,80]), [1,4,80,80] is the coordinates of the box, [1,80,80,80] is the confidence of the box corresponding to the 80 categories, and [1,1,80,80] is the sum of the confidence of the 80 categories. A short sketch of how these outputs are consumed in post-processing follows this note.
+
+   *(figure: output comparison, official original model on the left, optimized model on the right)*
+
+2. Taking the set of outputs ([1,4,80,80],[1,80,80,80],[1,1,80,80]) as an example, we remove the subgraphs behind the two convolution nodes in the model (the framed part in the figure), keep the outputs of these two convolutions ([1,4,80,80],[1,80,80,80]), and add a reducesum+clip branch for calculating the sum of the confidence of the 80 categories ([1,1,80,80]).
+
+   *(figure: graph comparison, the removed subgraphs are framed)*
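The score-sum branch exists purely to make post-processing cheaper: because the per-class confidences lie in [0, 1], the clipped sum is never smaller than the best single-class score, so one comparison can rule out a grid cell before the 80 class scores are scanned. This is exactly the early exit that `process_u8`/`process_i8` in the C demos perform. Below is a minimal numpy sketch of the idea for one branch of the optimized model; the function and array names are illustrative only, it assumes the 4 box channels are already decoded distances (the demo additionally handles a DFL head), and the real demo works on quantized tensors:

```python
import numpy as np

def filter_branch(box, score, score_sum, stride, threshold):
    """box: (4, H, W), score: (80, H, W), score_sum: (1, H, W) for one branch."""
    results = []
    _, H, W = score.shape
    for i in range(H):
        for j in range(W):
            # Fast filter: if the summed class confidence is already below the
            # threshold, no single class score can pass it either.
            if score_sum[0, i, j] < threshold:
                continue
            cls_id = int(np.argmax(score[:, i, j]))
            if score[cls_id, i, j] <= threshold:
                continue
            # Decode (left, top, right, bottom) grid distances into an xywh box,
            # mirroring the box computation in the C demo.
            l, t, r, b = box[:, i, j]
            x1 = (-l + j + 0.5) * stride
            y1 = (-t + i + 0.5) * stride
            x2 = (r + j + 0.5) * stride
            y2 = (b + i + 0.5) * stride
            results.append((x1, y1, x2 - x1, y2 - y1, cls_id, float(score[cls_id, i, j])))
    return results
```

The same pre-filter appears in both the i8 and u8 code paths; only the threshold is converted into the corresponding quantized domain first.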
+
 
 ## 4. Convert to RKNN
 
@@ -67,7 +79,7 @@ python convert.py ../model/yolov6n.onnx rk3588
 
 - `<onnx_model>`: Specify ONNX model path.
 - `<TARGET_PLATFORM>`: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<dtype>(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `i8`.
+- `<dtype>(optional)`: Specify as `i8`, `u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `i8`.
 - `<output_rknn_path>(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model with name `yolov6.rknn`
 
@@ -95,30 +107,12 @@ python yolov6.py --model_path <rknn_model> --target <TARGET_PLATFORM> --img_show
 
 ## 6. Android Demo
 
-#### 6.1 Compile and Build
-
-*Usage:*
-
-```sh
-# go back to the rknn_model_zoo root directory
-cd ../../
-export ANDROID_NDK_PATH=<android_ndk_path>
-
-./build-android.sh -t <TARGET_PLATFORM> -a <ARCH> -d yolov6
+**Note: RK1808, RV1109, RV1126 do not support Android.**
-# such as
-./build-android.sh -t rk3588 -a arm64-v8a -d yolov6
-```
+#### 6.1 Compile and Build
-*Description:*
-- `<android_ndk_path>`: Specify Android NDK path.
-- `<TARGET_PLATFORM>`: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<ARCH>`: Specify device system architecture. To query device architecture, refer to the following command:
-
-  ```shell
-  # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should shown in log.
-  adb shell cat /proc/version
-  ```
+Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo.
+**Note: Please replace the model name with `yolov6`.**
 
 #### 6.2 Push demo files to device
 
@@ -154,31 +148,8 @@ export LD_LIBRARY_PATH=./lib
 
 #### 7.1 Compile and Build
 
-*Usage:*
-
-```shell
-# go back to the rknn_model_zoo root directory
-cd ../../
-
-# if GCC_COMPILER not found while building, please set GCC_COMPILER path
-(optional)export GCC_COMPILER=<GCC_COMPILER_PATH>
-
-./build-linux.sh -t <TARGET_PLATFORM> -a <ARCH> -d yolov6
-
-# such as
-./build-linux.sh -t rk3588 -a aarch64 -d yolov6
-```
-
-*Description:*
-
-- `<GCC_COMPILER_PATH>`: Specified as GCC_COMPILER path.
-- `<TARGET_PLATFORM>` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform).
-- `<ARCH>`: Specify device system architecture. To query device architecture, refer to the following command:
-
-  ```shell
-  # Query architecture. For Linux, ['aarch64' or 'armhf'] should shown in log.
-  adb shell cat /proc/version
-  ```
+Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo.
+**Note: Please replace the model name with `yolov6`.** #### 7.2 Push demo files to device diff --git a/examples/yolov6/cpp/CMakeLists.txt b/examples/yolov6/cpp/CMakeLists.txt index c2a7ebb..14a6a40 100644 --- a/examples/yolov6/cpp/CMakeLists.txt +++ b/examples/yolov6/cpp/CMakeLists.txt @@ -9,6 +9,20 @@ if (ENABLE_ASAN) set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") endif () +set(rknpu_yolov6_file rknpu2/yolov6.cc) + +if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") + add_definitions(-DRV1106_1103) + set(rknpu_yolov6_file rknpu2/yolov6_rv1106_1103.cc) + #dma + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +endif() + +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + add_definitions(-DRKNPU1) + set(rknpu_yolov6_file rknpu1/yolov6.cc) +endif() + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) @@ -19,14 +33,15 @@ file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc postprocess.cc - rknpu2/yolov6.cc + ${rknpu_yolov6_file} ) -target_link_libraries(${PROJECT_NAME} - fileutils +target_link_libraries(${PROJECT_NAME} imageutils - imagedrawing + fileutils + imagedrawing ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/examples/yolov6/cpp/main.cc b/examples/yolov6/cpp/main.cc index da9121e..4f5367b 100644 --- a/examples/yolov6/cpp/main.cc +++ b/examples/yolov6/cpp/main.cc @@ -25,6 +25,10 @@ #include "file_utils.h" #include "image_drawing.h" +#if defined(RV1106_1103) + #include "dma_alloc.hpp" +#endif + /*------------------------------------------- Main Function -------------------------------------------*/ @@ -55,6 +59,19 @@ int main(int argc, char **argv) image_buffer_t src_image; memset(&src_image, 0, sizeof(image_buffer_t)); ret = read_image(image_path, &src_image); + +#if defined(RV1106_1103) + //RV1106 rga requires that input and output bufs are memory allocated by dma + ret = dma_buf_alloc(RV1106_CMA_HEAP_PATH, src_image.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + (void **) & (rknn_app_ctx.img_dma_buf.dma_buf_virt_addr)); + memcpy(rknn_app_ctx.img_dma_buf.dma_buf_virt_addr, src_image.virt_addr, src_image.size); + dma_sync_cpu_to_device(rknn_app_ctx.img_dma_buf.dma_buf_fd); + free(src_image.virt_addr); + src_image.virt_addr = (unsigned char *)rknn_app_ctx.img_dma_buf.dma_buf_virt_addr; + src_image.fd = rknn_app_ctx.img_dma_buf.dma_buf_fd; + rknn_app_ctx.img_dma_buf.size = src_image.size; +#endif + if (ret != 0) { printf("read image fail! 
ret=%d image_path=%s\n", ret, image_path); @@ -103,7 +120,12 @@ int main(int argc, char **argv) if (src_image.virt_addr != NULL) { +#if defined(RV1106_1103) + dma_buf_free(rknn_app_ctx.img_dma_buf.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + rknn_app_ctx.img_dma_buf.dma_buf_virt_addr); +#else free(src_image.virt_addr); +#endif } return 0; diff --git a/examples/yolov6/cpp/postprocess.cc b/examples/yolov6/cpp/postprocess.cc index d8a664b..5b95bbd 100644 --- a/examples/yolov6/cpp/postprocess.cc +++ b/examples/yolov6/cpp/postprocess.cc @@ -194,10 +194,18 @@ static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale) return res; } +static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } +static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } -void compute_dfl(float* tensor, int dfl_len, float* box){ +static void compute_dfl(float* tensor, int dfl_len, float* box){ for (int b=0; b<4; b++){ float exp_t[dfl_len]; float exp_sum=0; @@ -214,6 +222,94 @@ void compute_dfl(float* tensor, int dfl_len, float* box){ } } +static int process_u8(uint8_t *box_tensor, int32_t box_zp, float box_scale, + uint8_t *score_tensor, int32_t score_zp, float score_scale, + uint8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + uint8_t score_thres_u8 = qnt_f32_to_affine_u8(threshold, score_zp, score_scale); + uint8_t score_sum_thres_u8 = qnt_f32_to_affine_u8(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + int offset = i * grid_w + j; + int max_class_id = -1; + + // Use score sum to quickly filter + if (score_sum_tensor != nullptr) + { + if (score_sum_tensor[offset] < score_sum_thres_u8) + { + continue; + } + } + + uint8_t max_score = -score_zp; + for (int c = 0; c < OBJ_CLASS_NUM; c++) + { + if ((score_tensor[offset] > score_thres_u8) && (score_tensor[offset] > max_score)) + { + max_score = score_tensor[offset]; + max_class_id = c; + } + offset += grid_len; + } + + // compute box + if (max_score > score_thres_u8) + { + offset = i * grid_w + j; + float box[4]; + if (dfl_len > 1) + { + /// dfl + float before_dfl[dfl_len * 4]; + for (int k = 0; k < dfl_len * 4; k++) + { + before_dfl[k] = deqnt_affine_u8_to_f32(box_tensor[offset], box_zp, box_scale); + offset += grid_len; + } + compute_dfl(before_dfl, dfl_len, box); + } + else + { + for (int k = 0; k < 4; k++) + { + box[k] = deqnt_affine_u8_to_f32(box_tensor[offset], box_zp, box_scale); + offset += grid_len; + } + } + + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_u8_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount++; + } + } + } + printf("validCount=%d\n", validCount); + printf("grid h-%d, w-%d, stride %d\n", grid_h, grid_w, stride); + return validCount; 
+} static int process_i8(int8_t *box_tensor, int32_t box_zp, float box_scale, int8_t *score_tensor, int32_t score_zp, float score_scale, @@ -372,9 +468,81 @@ static int process_fp32(float *box_tensor, float *score_tensor, float *score_sum return validCount; } +#if defined(RV1106_1103) +static int process_i8_rv1106(int8_t *box_tensor, int32_t box_zp, float box_scale, + int8_t *score_tensor, int32_t score_zp, float score_scale, + int8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) { + int validCount = 0; + int grid_len = grid_h * grid_w; + int8_t score_thres_i8 = qnt_f32_to_affine(threshold, score_zp, score_scale); + int8_t score_sum_thres_i8 = qnt_f32_to_affine(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) { + for (int j = 0; j < grid_w; j++) { + int offset = i * grid_w + j; + int max_class_id = -1; + + // 通过 score sum 起到快速过滤的作用 + if (score_sum_tensor != nullptr) { + //score_sum_tensor [1, 1, 80, 80] + if (score_sum_tensor[offset] < score_sum_thres_i8) { + continue; + } + } + + int8_t max_score = -score_zp; + offset = offset * OBJ_CLASS_NUM; + for (int c = 0; c < OBJ_CLASS_NUM; c++) { + if ((score_tensor[offset + c] > score_thres_i8) && (score_tensor[offset + c] > max_score)) { + max_score = score_tensor[offset + c]; //80类 [1, 80, 80, 80] 3588NCHW 1106NHWC + max_class_id = c; + } + } + + // compute box + if (max_score > score_thres_i8) { + offset = (i * grid_w + j) * 4; + float box[4]; + for (int k = 0; k < 4; k++) { + box[k] = deqnt_affine_to_f32(box_tensor[offset + k], box_zp, box_scale); + } -int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount ++; + } + } + } + printf("validCount=%d\n", validCount); + printf("grid h-%d, w-%d, stride %d\n", grid_h, grid_w, stride); + return validCount; +} +#endif + +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) { +#if defined(RV1106_1103) + rknn_tensor_mem **_outputs = (rknn_tensor_mem **)outputs; +#else + rknn_output *_outputs = (rknn_output *)outputs; +#endif std::vector filterBoxes; std::vector objProbs; std::vector classId; @@ -388,41 +556,87 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t memset(od_results, 0, sizeof(object_detect_result_list)); // default 3 branch +#ifdef RKNPU1 + // NCHW reversed: WHCN + int dfl_len = app_ctx->output_attrs[0].dims[2] / 4; +#else int dfl_len = app_ctx->output_attrs[0].dims[1] /4; +#endif int output_per_branch = app_ctx->io_num.n_output / 3; for (int i = 0; i < 3; i++) { +#if defined(RV1106_1103) + void *score_sum = nullptr; + int32_t score_sum_zp = 0; + float score_sum_scale = 1.0; + if (output_per_branch == 3) { + score_sum = _outputs[i * output_per_branch + 2]->virt_addr; + score_sum_zp = app_ctx->output_attrs[i * output_per_branch + 
2].zp; + score_sum_scale = app_ctx->output_attrs[i * output_per_branch + 2].scale; + } + int box_idx = i * output_per_branch; + int score_idx = i * output_per_branch + 1; + grid_h = app_ctx->output_attrs[box_idx].dims[1]; + grid_w = app_ctx->output_attrs[box_idx].dims[2]; + stride = model_in_h / grid_h; + + if (app_ctx->is_quant) { + validCount += process_i8_rv1106((int8_t *)_outputs[box_idx]->virt_addr, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (int8_t *)_outputs[score_idx]->virt_addr, app_ctx->output_attrs[score_idx].zp, + app_ctx->output_attrs[score_idx].scale, (int8_t *)score_sum, score_sum_zp, score_sum_scale, + grid_h, grid_w, stride, dfl_len, filterBoxes, objProbs, classId, conf_threshold); + } + else + { + printf("RV1106/1103 only support quantization mode\n", LABEL_NALE_TXT_PATH); + return -1; + } +#else void *score_sum = nullptr; int32_t score_sum_zp = 0; float score_sum_scale = 1.0; if (output_per_branch == 3){ - score_sum = outputs[i*output_per_branch + 2].buf; + score_sum = _outputs[i*output_per_branch + 2].buf; score_sum_zp = app_ctx->output_attrs[i*output_per_branch + 2].zp; score_sum_scale = app_ctx->output_attrs[i*output_per_branch + 2].scale; } int box_idx = i*output_per_branch; int score_idx = i*output_per_branch + 1; +#ifdef RKNPU1 + // NCHW reversed: WHCN + grid_h = app_ctx->output_attrs[box_idx].dims[1]; + grid_w = app_ctx->output_attrs[box_idx].dims[0]; +#else grid_h = app_ctx->output_attrs[box_idx].dims[2]; grid_w = app_ctx->output_attrs[box_idx].dims[3]; +#endif stride = model_in_h / grid_h; if (app_ctx->is_quant) { - validCount += process_i8((int8_t *)outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, - (int8_t *)outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, +#ifdef RKNPU1 + validCount += process_u8((uint8_t *)_outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (uint8_t *)_outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, + (uint8_t *)score_sum, score_sum_zp, score_sum_scale, + grid_h, grid_w, stride, dfl_len, + filterBoxes, objProbs, classId, conf_threshold); +#else + validCount += process_i8((int8_t *)_outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (int8_t *)_outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, (int8_t *)score_sum, score_sum_zp, score_sum_scale, grid_h, grid_w, stride, dfl_len, filterBoxes, objProbs, classId, conf_threshold); +#endif } else { - validCount += process_fp32((float *)outputs[box_idx].buf, (float *)outputs[score_idx].buf, (float *)score_sum, + validCount += process_fp32((float *)_outputs[box_idx].buf, (float *)_outputs[score_idx].buf, (float *)score_sum, grid_h, grid_w, stride, dfl_len, filterBoxes, objProbs, classId, conf_threshold); } - +#endif } // no object detect diff --git a/examples/yolov6/cpp/postprocess.h b/examples/yolov6/cpp/postprocess.h index 900c4a1..63a677d 100644 --- a/examples/yolov6/cpp/postprocess.h +++ b/examples/yolov6/cpp/postprocess.h @@ -30,7 +30,7 @@ typedef struct { int init_post_process(); void deinit_post_process(); char *coco_cls_to_name(int cls_id); -int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); +int post_process(rknn_app_context_t *app_ctx, void *outputs, 
letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); void deinitPostProcess(); #endif //_RKNN_YOLOV6_DEMO_POSTPROCESS_H_ diff --git a/examples/yolov6/cpp/rknpu1/yolov6.cc b/examples/yolov6/cpp/rknpu1/yolov6.cc new file mode 100644 index 0000000..a82876f --- /dev/null +++ b/examples/yolov6/cpp/rknpu1/yolov6.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolov6.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov6_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type == RKNN_TENSOR_UINT8) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov6_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov6_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolov6/cpp/rknpu2/yolov6.cc b/examples/yolov6/cpp/rknpu2/yolov6.cc index d880110..0234518 100644 --- a/examples/yolov6/cpp/rknpu2/yolov6.cc +++ b/examples/yolov6/cpp/rknpu2/yolov6.cc @@ -137,11 +137,6 @@ int init_yolov6_model(const char *model_path, rknn_app_context_t *app_ctx) int release_yolov6_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -152,6 +147,11 @@ int release_yolov6_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/yolov6/cpp/rknpu2/yolov6_rv1106_1103.cc b/examples/yolov6/cpp/rknpu2/yolov6_rv1106_1103.cc new file mode 100644 index 0000000..a440891 --- /dev/null +++ b/examples/yolov6/cpp/rknpu2/yolov6_rv1106_1103.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolov6.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov6_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + ret = rknn_init(&ctx, (char *)model_path, 0, 0, NULL); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // default input type is int8 (normalize and quantize need compute in outside) + // if set uint8, will fuse normalize and quantize to npu + input_attrs[0].type = RKNN_TENSOR_UINT8; + // default fmt is NHWC,1106 npu only support NHWC in zero copy mode + input_attrs[0].fmt = RKNN_TENSOR_NHWC; + printf("input_attrs[0].size_with_stride=%d\n", input_attrs[0].size_with_stride); + app_ctx->input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride); + + // Set input tensor memory + ret = rknn_set_io_mem(ctx, app_ctx->input_mems[0], &input_attrs[0]); + if (ret < 0) { + printf("input_mems rknn_set_io_mem fail! ret=%d\n", ret); + return -1; + } + + // Set output tensor memory + for (uint32_t i = 0; i < io_num.n_output; ++i) { + app_ctx->output_mems[i] = rknn_create_mem(ctx, output_attrs[i].size_with_stride); + ret = rknn_set_io_mem(ctx, app_ctx->output_mems[i], &output_attrs[i]); + if (ret < 0) { + printf("output_mems rknn_set_io_mem fail! 
ret=%d\n", ret); + return -1; + } + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type == RKNN_TENSOR_INT8) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[1]; + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[3]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[2]; + app_ctx->model_channel = input_attrs[0].dims[3]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov6_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + for (int i = 0; i < app_ctx->io_num.n_input; i++) { + if (app_ctx->input_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); + } + } + for (int i = 0; i < app_ctx->io_num.n_output; i++) { + if (app_ctx->output_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); + } + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov6_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 + const float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.fd = app_ctx->input_mems[0]->fd; + if (dst_img.virt_addr == NULL && dst_img.fd == 0) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Post Process + post_process(app_ctx, app_ctx->output_mems, &letter_box, box_conf_threshold, nms_threshold, od_results); + +out: + return ret; +} \ No newline at end of file diff --git a/examples/yolov6/cpp/yolov6.h b/examples/yolov6/cpp/yolov6.h index 798ff52..e35fa99 100644 --- a/examples/yolov6/cpp/yolov6.h +++ b/examples/yolov6/cpp/yolov6.h @@ -18,12 +18,24 @@ #include "rknn_api.h" #include "common.h" +#if defined(RV1106_1103) + typedef struct { + char *dma_buf_virt_addr; + int dma_buf_fd; + int size; + }rknn_dma_buf; +#endif typedef struct { rknn_context rknn_ctx; rknn_input_output_num io_num; rknn_tensor_attr* input_attrs; rknn_tensor_attr* output_attrs; +#if defined(RV1106_1103) + rknn_tensor_mem* input_mems[1]; + rknn_tensor_mem* output_mems[9]; + rknn_dma_buf img_dma_buf; +#endif int model_channel; int model_width; int model_height; diff --git a/examples/yolov6/model_comparison/yolov6_graph_comparison.jpg b/examples/yolov6/model_comparison/yolov6_graph_comparison.jpg new file mode 100644 index 0000000..054ee89 Binary files /dev/null and b/examples/yolov6/model_comparison/yolov6_graph_comparison.jpg differ diff --git a/examples/yolov6/model_comparison/yolov6_output_comparison.jpg b/examples/yolov6/model_comparison/yolov6_output_comparison.jpg new file mode 100644 index 0000000..4d8b954 Binary files /dev/null and b/examples/yolov6/model_comparison/yolov6_output_comparison.jpg differ diff --git a/examples/yolov6/python/convert.py b/examples/yolov6/python/convert.py index 0c233a7..05673ed 100644 --- a/examples/yolov6/python/convert.py +++ b/examples/yolov6/python/convert.py @@ -1,6 +1,4 @@ -import os import sys -import numpy as np from rknn.api import RKNN DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt' @@ -10,8 +8,9 @@ def parse_arg(): if len(sys.argv) < 3: print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -20,10 +19,10 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False diff --git a/examples/yolov7/README.md b/examples/yolov7/README.md index 13a9aee..9325930 100644 --- a/examples/yolov7/README.md +++ b/examples/yolov7/README.md @@ -29,7 +29,7 @@ https://github.com/airockchip/yolov7 ## 2. Current Support Platform -RK3566, RK3568, RK3588, RK3562 +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 @@ -46,6 +46,18 @@ cd model ./download_model.sh ``` +**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolov7-tiny.onnx as an example to show the difference between them. +1. The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. The three colored boxes in the figure represent the changes of the three outputs. + +
+
+<div align=center>
+  <img src="./model_comparison/yolov7_output_comparison.jpg" alt="Image">
+</div>
+
+2. Taking the output change [1,3,20,20,85]->[1,255,20,20] as an example, we remove the subgraph that follows each convolution node in the model (the framed part in the figure) and keep the convolution output ([1,255,20,20]).
+
+<div align=center>
+  <img src="./model_comparison/yolov7_graph_comparison.jpg" alt="Image">
+</div>
+
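Since the optimized model now ends at the convolution outputs, box decoding moves into the demo's post-processing code. The following is a minimal numpy sketch (not part of the patch) of how one such [1,255,20,20] branch can be decoded on the CPU, mirroring the `process_fp32` float path in the demo's `postprocess.cc`. The stride and anchor values are illustrative placeholders, and the sketch assumes the exported graph still applies the sigmoid activation, as the C code does.

```python
import numpy as np

# Hypothetical decode of one [1, 255, 20, 20] yolov7 branch: 255 = 3 anchors x
# (4 box + 1 objectness + 80 class scores). Assumes sigmoid is already applied
# inside the graph; the stride/anchor values used below are illustrative only.
def decode_branch(out, stride, anchors, conf_thres=0.25):
    _, _, h, w = out.shape
    out = out.reshape(3, 85, h, w)
    boxes, scores, class_ids = [], [], []
    for a in range(3):
        for i in range(h):
            for j in range(w):
                obj = out[a, 4, i, j]
                if obj < conf_thres:
                    continue
                # yolov5/yolov7-style grid + anchor decoding
                cx = (out[a, 0, i, j] * 2.0 - 0.5 + j) * stride
                cy = (out[a, 1, i, j] * 2.0 - 0.5 + i) * stride
                bw = (out[a, 2, i, j] * 2.0) ** 2 * anchors[a][0]
                bh = (out[a, 3, i, j] * 2.0) ** 2 * anchors[a][1]
                c = int(np.argmax(out[a, 5:, i, j]))
                boxes.append([cx - bw / 2, cy - bh / 2, bw, bh])  # x, y, w, h
                scores.append(float(obj * out[a, 5 + c, i, j]))
                class_ids.append(c)
    return boxes, scores, class_ids

branch = np.random.rand(1, 255, 20, 20).astype(np.float32)  # fake [0,1] activations
boxes, scores, class_ids = decode_branch(branch, stride=32,
                                         anchors=[(142, 110), (192, 243), (459, 401)])
print(len(boxes), "candidate boxes before NMS")
```

Running NMS over the surviving candidates then yields the final detections, which is exactly what `post_process` does after calling the per-branch decoders.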
## 4. Convert to RKNN @@ -65,7 +77,7 @@ python convert.py ../model/yolov7-tiny.onnx rk3588 - ``: Specify ONNX model path. - ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- `(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `i8`. +- `(optional)`: Specify as `i8`, `u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `i8`. - `(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model with name `yolov7.rknn` @@ -93,29 +105,12 @@ python yolov7.py --model_path --target --img_show ## 6. Android Demo -#### 6.1 Compile and Build - -*Usage:* - -```sh -# go back to the rknn_model_zoo root directory -cd ../../ -export ANDROID_NDK_PATH= - -./build-android.sh -t -a -d yolov7 +**Note: RK1808, RV1109, RV1126 does not support Android.** -# such as -./build-android.sh -t rk3588 -a arm64-v8a -d yolov7 -``` +#### 6.1 Compile and Build -*Description:* -- ``: Specify Android NDK path. -- ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - ```shell - # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `yolov7`.** #### 6.2 Push demo files to device @@ -151,31 +146,8 @@ export LD_LIBRARY_PATH=./lib #### 7.1 Compile and Build -*Usage:* - -```shell -# go back to the rknn_model_zoo root directory -cd ../../ - -# if GCC_COMPILER not found while building, please set GCC_COMPILER path -(optional)export GCC_COMPILER= - -./build-linux.sh -t -a -d yolov7 - -# such as -./build-linux.sh -t rk3588 -a aarch64 -d yolov7 -``` - -*Description:* - -- ``: Specified as GCC_COMPILER path. -- `` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - - ```shell - # Query architecture. For Linux, ['aarch64' or 'armhf'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `yolov7`.** #### 7.2 Push demo files to device @@ -221,4 +193,4 @@ person @ (79 330 124 524) 0.346 -- Note: Different platforms, different versions of tools and drivers may have slightly different results. \ No newline at end of file +- Note: Different platforms, different versions of tools and drivers may have slightly different results. 
diff --git a/examples/yolov7/cpp/CMakeLists.txt b/examples/yolov7/cpp/CMakeLists.txt index 3b408e0..aa4cfc6 100644 --- a/examples/yolov7/cpp/CMakeLists.txt +++ b/examples/yolov7/cpp/CMakeLists.txt @@ -9,6 +9,14 @@ if (ENABLE_ASAN) set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") endif () +set(rknpu2_yolov7_file rknpu2/yolov7.cc) +if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") + add_definitions(-DRV1106_1103) + set(rknpu2_yolov7_file rknpu2/yolov7_rv1106_1103.cc) + #dma + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +endif() + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) @@ -16,17 +24,32 @@ set(CMAKE_INSTALL_RPATH "$ORIGIN/lib") file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) +set(rknpu_yolov7_file rknpu2/yolov7.cc) + +if(TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") + add_definitions(-DRV1106_1103) + set(rknpu_yolov7_file rknpu2/yolov7_rv1106_1103.cc) + # dma + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +endif() + +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + add_definitions(-DRKNPU1) + set(rknpu_yolov7_file rknpu1/yolov7.cc) +endif() + add_executable(${PROJECT_NAME} main.cc postprocess.cc - rknpu2/yolov7.cc + ${rknpu_yolov7_file} ) target_link_libraries(${PROJECT_NAME} - fileutils imageutils + fileutils imagedrawing ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/examples/yolov7/cpp/main.cc b/examples/yolov7/cpp/main.cc index 7b464d9..ab3f8ba 100644 --- a/examples/yolov7/cpp/main.cc +++ b/examples/yolov7/cpp/main.cc @@ -26,6 +26,10 @@ #include "image_drawing.h" #include "easy_timer.h" +#if defined(RV1106_1103) + #include "dma_alloc.hpp" +#endif + /*------------------------------------------- Main Function -------------------------------------------*/ @@ -63,6 +67,19 @@ int main(int argc, char **argv) image_buffer_t src_image; memset(&src_image, 0, sizeof(image_buffer_t)); ret = read_image(image_path, &src_image); + +#if defined(RV1106_1103) + //RV1106 rga requires that input and output bufs are memory allocated by dma + ret = dma_buf_alloc(RV1106_CMA_HEAP_PATH, src_image.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + (void **) & (rknn_app_ctx.img_dma_buf.dma_buf_virt_addr)); + memcpy(rknn_app_ctx.img_dma_buf.dma_buf_virt_addr, src_image.virt_addr, src_image.size); + dma_sync_cpu_to_device(rknn_app_ctx.img_dma_buf.dma_buf_fd); + free(src_image.virt_addr); + src_image.virt_addr = (unsigned char *)rknn_app_ctx.img_dma_buf.dma_buf_virt_addr; + src_image.fd = rknn_app_ctx.img_dma_buf.dma_buf_fd; + rknn_app_ctx.img_dma_buf.size = src_image.size; +#endif + if (ret != 0) { printf("read image fail! 
ret=%d image_path=%s\n", ret, image_path); @@ -114,7 +131,12 @@ int main(int argc, char **argv) if (src_image.virt_addr != NULL) { +#if defined(RV1106_1103) + dma_buf_free(rknn_app_ctx.img_dma_buf.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + rknn_app_ctx.img_dma_buf.dma_buf_virt_addr); +#else free(src_image.virt_addr); +#endif } return 0; diff --git a/examples/yolov7/cpp/postprocess.cc b/examples/yolov7/cpp/postprocess.cc index 6718c46..c875e86 100644 --- a/examples/yolov7/cpp/postprocess.cc +++ b/examples/yolov7/cpp/postprocess.cc @@ -198,8 +198,74 @@ static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale) return res; } +static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } +static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } + +static int process_u8(uint8_t *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride, + std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold, + int32_t zp, float scale) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + uint8_t thres_u8 = qnt_f32_to_affine_u8(threshold, zp, scale); + for (int a = 0; a < 3; a++) + { + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + uint8_t box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j]; + if (box_confidence >= thres_u8) + { + int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; + uint8_t *in_ptr = input + offset; + float box_x = (deqnt_affine_u8_to_f32(*in_ptr, zp, scale)) * 2.0 - 0.5; + float box_y = (deqnt_affine_u8_to_f32(in_ptr[grid_len], zp, scale)) * 2.0 - 0.5; + float box_w = (deqnt_affine_u8_to_f32(in_ptr[2 * grid_len], zp, scale)) * 2.0; + float box_h = (deqnt_affine_u8_to_f32(in_ptr[3 * grid_len], zp, scale)) * 2.0; + box_x = (box_x + j) * (float)stride; + box_y = (box_y + i) * (float)stride; + box_w = box_w * box_w * (float)anchor[a * 2]; + box_h = box_h * box_h * (float)anchor[a * 2 + 1]; + box_x -= (box_w / 2.0); + box_y -= (box_h / 2.0); + + uint8_t maxClassProbs = in_ptr[5 * grid_len]; + int maxClassId = 0; + for (int k = 1; k < OBJ_CLASS_NUM; ++k) + { + uint8_t prob = in_ptr[(5 + k) * grid_len]; + if (prob > maxClassProbs) + { + maxClassId = k; + maxClassProbs = prob; + } + } + if (maxClassProbs > thres_u8) + { + objProbs.push_back((deqnt_affine_u8_to_f32(maxClassProbs, zp, scale)) * (deqnt_affine_u8_to_f32(box_confidence, zp, scale))); + classId.push_back(maxClassId); + validCount++; + boxes.push_back(box_x); + boxes.push_back(box_y); + boxes.push_back(box_w); + boxes.push_back(box_h); + } + } + } + } + } + return validCount; +} + static int process_i8(int8_t *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride, std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold, int32_t zp, float scale) @@ -257,6 +323,71 @@ static int process_i8(int8_t *input, int *anchor, int grid_h, int grid_w, int he return validCount; } +static int process_i8_rv1106(int8_t *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride, + std::vector &boxes, std::vector &boxScores, std::vector &classId, float threshold, + int32_t zp, float scale) { + int validCount = 0; + int8_t thres_i8 = qnt_f32_to_affine(threshold, zp, scale); 
+ + int anchor_per_branch = 3; + int align_c = PROP_BOX_SIZE * anchor_per_branch; + + for (int h = 0; h < grid_h; h++) { + for (int w = 0; w < grid_w; w++) { + for (int a = 0; a < anchor_per_branch; a++) { + int hw_offset = h * grid_w * align_c + w * align_c + a * PROP_BOX_SIZE; + int8_t *hw_ptr = input + hw_offset; + int8_t box_confidence = hw_ptr[4]; + + if (box_confidence >= thres_i8) { + int8_t maxClassProbs = hw_ptr[5]; + int maxClassId = 0; + for (int k = 1; k < OBJ_CLASS_NUM; ++k) { + int8_t prob = hw_ptr[5 + k]; + if (prob > maxClassProbs) { + maxClassId = k; + maxClassProbs = prob; + } + } + + float box_conf_f32 = deqnt_affine_to_f32(box_confidence, zp, scale); + float class_prob_f32 = deqnt_affine_to_f32(maxClassProbs, zp, scale); + float limit_score = box_conf_f32 * class_prob_f32; + + if (limit_score > threshold) { + float box_x, box_y, box_w, box_h; + + box_x = deqnt_affine_to_f32(hw_ptr[0], zp, scale) * 2.0 - 0.5; + box_y = deqnt_affine_to_f32(hw_ptr[1], zp, scale) * 2.0 - 0.5; + box_w = deqnt_affine_to_f32(hw_ptr[2], zp, scale) * 2.0; + box_h = deqnt_affine_to_f32(hw_ptr[3], zp, scale) * 2.0; + box_w = box_w * box_w; + box_h = box_h * box_h; + + + box_x = (box_x + w) * (float)stride; + box_y = (box_y + h) * (float)stride; + box_w *= (float)anchor[a * 2]; + box_h *= (float)anchor[a * 2 + 1]; + + box_x -= (box_w / 2.0); + box_y -= (box_h / 2.0); + + boxes.push_back(box_x); + boxes.push_back(box_y); + boxes.push_back(box_w); + boxes.push_back(box_h); + boxScores.push_back(limit_score); + classId.push_back(maxClassId); + validCount++; + } + } + } + } + } + return validCount; +} + static int process_fp32(float *input, int *anchor, int grid_h, int grid_w, int height, int width, int stride, std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold) { @@ -313,8 +444,13 @@ static int process_fp32(float *input, int *anchor, int grid_h, int grid_w, int h return validCount; } -int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) { +#if defined(RV1106_1103) + rknn_tensor_mem **_outputs = (rknn_tensor_mem **)outputs; +#else + rknn_output *_outputs = (rknn_output *)outputs; +#endif std::vector filterBoxes; std::vector objProbs; std::vector classId; @@ -329,20 +465,45 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t for (int i = 0; i < 3; i++) { +#if defined(RV1106_1103) + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[2]; + stride = model_in_h / grid_h; + //RV1106 only support i8 + if (app_ctx->is_quant) { + validCount += process_i8_rv1106((int8_t *)(_outputs[i]->virt_addr), (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); + } +#elif defined(RKNPU1) + // NCHW reversed in dims: WHCN + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[0]; + stride = model_in_h / grid_h; + + if (app_ctx->is_quant) + { + validCount += process_u8((uint8_t *)_outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); + } else { + validCount 
+= process_fp32((float *)_outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold); + } +#else grid_h = app_ctx->output_attrs[i].dims[2]; grid_w = app_ctx->output_attrs[i].dims[3]; stride = model_in_h / grid_h; if (app_ctx->is_quant) { - validCount += process_i8((int8_t *)outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + validCount += process_i8((int8_t *)_outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); } else { - validCount += process_fp32((float *)outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + validCount += process_fp32((float *)_outputs[i].buf, (int *)anchor[i], grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, classId, conf_threshold); } +#endif } // no object detect diff --git a/examples/yolov7/cpp/postprocess.h b/examples/yolov7/cpp/postprocess.h index 8ecd02f..962d235 100644 --- a/examples/yolov7/cpp/postprocess.h +++ b/examples/yolov7/cpp/postprocess.h @@ -31,7 +31,7 @@ typedef struct { int init_post_process(); void deinit_post_process(); char *coco_cls_to_name(int cls_id); -int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); void deinitPostProcess(); #endif //_RKNN_YOLOV5_DEMO_POSTPROCESS_H_ diff --git a/examples/yolov7/cpp/rknpu1/yolov7.cc b/examples/yolov7/cpp/rknpu1/yolov7.cc new file mode 100644 index 0000000..330b5f5 --- /dev/null +++ b/examples/yolov7/cpp/rknpu1/yolov7.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "yolov7.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" +#include "easy_timer.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov7_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov7_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov7_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold + int bg_color = 114; + TIMER timer; + timer.indent_set(""); + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + timer.tik(); + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + timer.tok(); + timer.print_time("convert_image_with_letterbox"); + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + timer.tik(); + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! 
ret=%d\n", ret); + return -1; + } + timer.tok(); + timer.print_time("rknn_inputs_set"); + + // Run + timer.tik(); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + timer.tok(); + timer.print_time("rknn_run"); + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + + timer.tik(); + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + timer.tok(); + timer.print_time("rknn_outputs_get"); + + // Post Process + timer.tik(); + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + timer.tok(); + timer.print_time("post_process"); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolov7/cpp/rknpu2/yolov7.cc b/examples/yolov7/cpp/rknpu2/yolov7.cc index 57c0ac7..e5ce9d9 100644 --- a/examples/yolov7/cpp/rknpu2/yolov7.cc +++ b/examples/yolov7/cpp/rknpu2/yolov7.cc @@ -138,11 +138,6 @@ int init_yolov7_model(const char *model_path, rknn_app_context_t *app_ctx) int release_yolov7_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -153,6 +148,11 @@ int release_yolov7_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/yolov7/cpp/rknpu2/yolov7_rv1106_1103.cc b/examples/yolov7/cpp/rknpu2/yolov7_rv1106_1103.cc new file mode 100644 index 0000000..85c2545 --- /dev/null +++ b/examples/yolov7/cpp/rknpu2/yolov7_rv1106_1103.cc @@ -0,0 +1,248 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "yolov7.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" +#include "easy_timer.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov7_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + ret = rknn_init(&ctx, (char *)model_path, 0, 0, NULL); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + //When using the zero-copy API interface, query the native output tensor attribute + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // default input type is int8 (normalize and quantize need compute in outside) + // if set uint8, will fuse normalize and quantize to npu + input_attrs[0].type = RKNN_TENSOR_UINT8; + // default fmt is NHWC,1106 npu only support NHWC in zero copy mode + input_attrs[0].fmt = RKNN_TENSOR_NHWC; + printf("input_attrs[0].size_with_stride=%d\n", input_attrs[0].size_with_stride); + app_ctx->input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride); + + // Set input tensor memory + ret = rknn_set_io_mem(ctx, app_ctx->input_mems[0], &input_attrs[0]); + if (ret < 0) { + printf("input_mems rknn_set_io_mem fail! ret=%d\n", ret); + return -1; + } + + // Set output tensor memory + for (uint32_t i = 0; i < io_num.n_output; ++i) { + app_ctx->output_mems[i] = rknn_create_mem(ctx, output_attrs[i].size_with_stride); + ret = rknn_set_io_mem(ctx, app_ctx->output_mems[i], &output_attrs[i]); + if (ret < 0) { + printf("output_mems rknn_set_io_mem fail! 
ret=%d\n", ret); + return -1; + } + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[1]; + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[3]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[2]; + app_ctx->model_channel = input_attrs[0].dims[3]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov7_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + for (int i = 0; i < app_ctx->io_num.n_input; i++) { + if (app_ctx->input_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); + } + } + for (int i = 0; i < app_ctx->io_num.n_output; i++) { + if (app_ctx->output_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); + } + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov7_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 + const float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + int bg_color = 114; + TIMER timer; + timer.indent_set(""); + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.fd = app_ctx->input_mems[0]->fd; + if (dst_img.virt_addr == NULL && dst_img.fd == 0) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + timer.tik(); + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + timer.tok(); + timer.print_time("convert_image_with_letterbox"); + + // Run + timer.tik(); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + timer.tok(); + timer.print_time("rknn_run"); + + // Post Process + timer.tik(); + post_process(app_ctx, app_ctx->output_mems, &letter_box, box_conf_threshold, nms_threshold, od_results); + timer.tok(); + timer.print_time("post_process"); + +out: + return ret; +} \ No newline at end of file diff --git a/examples/yolov7/cpp/yolov7.h b/examples/yolov7/cpp/yolov7.h index b3b42b2..877c694 100644 --- a/examples/yolov7/cpp/yolov7.h +++ b/examples/yolov7/cpp/yolov7.h @@ -18,12 +18,24 @@ #include "rknn_api.h" #include "common.h" +#if defined(RV1106_1103) + typedef struct { + char *dma_buf_virt_addr; + int dma_buf_fd; + int size; + }rknn_dma_buf; +#endif typedef struct { rknn_context rknn_ctx; rknn_input_output_num io_num; rknn_tensor_attr* input_attrs; rknn_tensor_attr* output_attrs; +#if defined(RV1106_1103) + rknn_tensor_mem* input_mems[1]; + rknn_tensor_mem* output_mems[3]; + rknn_dma_buf img_dma_buf; +#endif int model_channel; int model_width; int model_height; diff --git a/examples/yolov7/model_comparison/yolov7_graph_comparison.jpg b/examples/yolov7/model_comparison/yolov7_graph_comparison.jpg new file mode 100644 index 0000000..16b1cdf Binary files /dev/null and b/examples/yolov7/model_comparison/yolov7_graph_comparison.jpg differ diff --git a/examples/yolov7/model_comparison/yolov7_output_comparison.jpg b/examples/yolov7/model_comparison/yolov7_output_comparison.jpg new file mode 100644 index 0000000..21f1346 Binary files /dev/null and b/examples/yolov7/model_comparison/yolov7_output_comparison.jpg differ diff --git a/examples/yolov7/python/convert.py b/examples/yolov7/python/convert.py index 84da70c..33cf1de 100644 --- a/examples/yolov7/python/convert.py +++ b/examples/yolov7/python/convert.py @@ -1,6 +1,4 @@ -import os import sys -import numpy as np from rknn.api import RKNN DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt' @@ -9,9 +7,10 @@ def parse_arg(): if len(sys.argv) < 3: - print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -20,10 +19,10 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False diff --git a/examples/yolov8/README.md b/examples/yolov8/README.md index faf287e..c3d99ae 100644 --- a/examples/yolov8/README.md +++ b/examples/yolov8/README.md @@ -29,7 +29,7 @@ https://github.com/airockchip/ultralytics_yolov8 ## 2. Current Support Platform -RK3566, RK3568, RK3588, RK3562 +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 @@ -46,6 +46,18 @@ cd model ./download_model.sh ``` +**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolov8n.onnx as an example to show the difference between them. +1. 
The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. As shown in the figure, the original model's single output is divided into three groups. For example, in the group of outputs ([1,64,80,80],[1,80,80,80],[1,1,80,80]): [1,64,80,80] holds the box coordinates, [1,80,80,80] holds the box confidences for the 80 categories, and [1,1,80,80] holds the sum of those 80 confidences. + +
+<div align=center> + <img src="./model_comparison/yolov8_output_comparison.jpg" alt="Image"/> +</div>
+ +2. Taking the set of outputs ([1,64,80,80],[1,80,80,80],[1,1,80,80]) as an example, we remove the subgraphs behind the two convolution nodes in the model, keep the outputs of these two convolutions ([1,64,80,80],[1,80,80,80]), and add a ReduceSum+Clip branch that computes the sum of the confidences of the 80 categories ([1,1,80,80]); a sketch of how this extra output is used follows the figure below. + +
+<div align=center> + <img src="./model_comparison/yolov8_graph_comparison.jpg" alt="Image"/> +</div>
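The following minimal sketch (illustrative only, not code from this patch) shows why the ReduceSum+Clip output speeds up post-processing. It mirrors the fast-filter loop that `process_i8`/`process_u8` in this patch perform on quantized tensors, but uses float for clarity; the layout is NCHW with the class planes contiguous, and `count_candidates` is a hypothetical standalone helper.

```c
/* Sketch: fast candidate filtering with the score-sum tensor.
 * scores:    [OBJ_CLASS_NUM, grid_h, grid_w] per-class confidences
 * score_sum: [1, grid_h, grid_w] output of the added ReduceSum+Clip branch
 * Because every class score is non-negative, score_sum < threshold
 * guarantees that no single class score can exceed threshold either. */
#define OBJ_CLASS_NUM 80

static int count_candidates(const float *scores, const float *score_sum,
                            int grid_h, int grid_w, float threshold)
{
    int grid_len = grid_h * grid_w;
    int candidates = 0;
    for (int i = 0; i < grid_h; i++) {
        for (int j = 0; j < grid_w; j++) {
            int offset = i * grid_w + j;
            if (score_sum[offset] < threshold)
                continue;           /* cheap reject: skips the 80-class scan */
            for (int c = 0; c < OBJ_CLASS_NUM; c++) {
                if (scores[offset] > threshold) {
                    candidates++;   /* this grid cell holds at least one box */
                    break;
                }
                offset += grid_len; /* NCHW: jump to the next class plane */
            }
        }
    }
    return candidates;
}
```

In the patch itself the same comparison is done directly on the quantized int8/uint8 values (the threshold is first quantized with the tensor's zp/scale), so no dequantization happens in the hot loop.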
## 4. Convert to RKNN @@ -65,7 +77,7 @@ python convert.py ../model/yolov8n.onnx rk3588 - ``: Specify ONNX model path. - ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- `(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `i8`. +- `(optional)`: Specify as `i8`, `u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `i8`. - `(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model with name `yolov8.rknn` @@ -93,29 +105,12 @@ python yolov8.py --model_path --target --img_show ## 6. Android Demo -#### 6.1 Compile and Build - -*Usage:* - -```sh -# go back to the rknn_model_zoo root directory -cd ../../ -export ANDROID_NDK_PATH= - -./build-android.sh -t -a -d yolov8 +**Note: RK1808, RV1109, RV1126 does not support Android.** -# such as -./build-android.sh -t rk3588 -a arm64-v8a -d yolov8 -``` +#### 6.1 Compile and Build -*Description:* -- ``: Specify Android NDK path. -- ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - ```shell - # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `yolov8`.** #### 6.2 Push demo files to device @@ -151,31 +146,8 @@ export LD_LIBRARY_PATH=./lib #### 7.1 Compile and Build -*Usage:* - -```shell -# go back to the rknn_model_zoo root directory -cd ../../ - -# if GCC_COMPILER not found while building, please set GCC_COMPILER path -(optional)export GCC_COMPILER= - -./build-linux.sh -t -a -d yolov8 - -# such as -./build-linux.sh -t rk3588 -a aarch64 -d yolov8 -``` - -*Description:* - -- ``: Specified as GCC_COMPILER path. -- `` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - - ```shell - # Query architecture. For Linux, ['aarch64' or 'armhf'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `yolov8`.** #### 7.2 Push demo files to device @@ -221,4 +193,4 @@ person @ (80 326 116 513) 0.311 -- Note: Different platforms, different versions of tools and drivers may have slightly different results. \ No newline at end of file +- Note: Different platforms, different versions of tools and drivers may have slightly different results. 
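The `i8`/`u8` options in the conversion step above correspond to asymmetric affine quantization; the C/C++ demos then dequantize outputs with the zero-point and scale stored in each tensor's attributes. A minimal sketch of the round trip, mirroring `qnt_f32_to_affine_u8`/`deqnt_affine_u8_to_f32` from this patch (the zp/scale values below are invented for illustration; real ones come from `rknn_query`):

```c
/* q = clip(f/scale + zp), f = (q - zp) * scale.
 * RKNPU2 platforms store int8 tensors; RKNPU1 (RK1808/RV1109/RV1126)
 * store uint8, which is why the patch adds the *_u8 helpers. */
#include <stdint.h>
#include <stdio.h>

static int32_t clip_i32(float v, float lo, float hi)
{
    return (int32_t)(v <= lo ? lo : (v >= hi ? hi : v));
}

static uint8_t qnt_f32_to_u8(float f32, int32_t zp, float scale)
{
    return (uint8_t)clip_i32(f32 / scale + zp, 0, 255);
}

static float deqnt_u8_to_f32(uint8_t q, int32_t zp, float scale)
{
    return ((float)q - (float)zp) * scale;
}

int main(void)
{
    int32_t zp = 7;         /* example attribute values only */
    float scale = 0.0039f;
    uint8_t q = qnt_f32_to_u8(0.25f, zp, scale);
    printf("q=%d back=%f\n", q, deqnt_u8_to_f32(q, zp, scale));
    return 0;
}
```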
diff --git a/examples/yolov8/cpp/CMakeLists.txt b/examples/yolov8/cpp/CMakeLists.txt index 6f41bd8..582fac4 100644 --- a/examples/yolov8/cpp/CMakeLists.txt +++ b/examples/yolov8/cpp/CMakeLists.txt @@ -9,6 +9,20 @@ if (ENABLE_ASAN) set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") endif () +set(rknpu_yolov8_file rknpu2/yolov8.cc) + +if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") + add_definitions(-DRV1106_1103) + set(rknpu_yolov8_file rknpu2/yolov8_rv1106_1103.cc) + #dma + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +endif() + +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + add_definitions(-DRKNPU1) + set(rknpu_yolov8_file rknpu1/yolov8.cc) +endif() + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) @@ -19,14 +33,15 @@ file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc postprocess.cc - rknpu2/yolov8.cc + ${rknpu_yolov8_file} ) target_link_libraries(${PROJECT_NAME} - fileutils imageutils + fileutils imagedrawing ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/examples/yolov8/cpp/main.cc b/examples/yolov8/cpp/main.cc index e90045a..b963901 100644 --- a/examples/yolov8/cpp/main.cc +++ b/examples/yolov8/cpp/main.cc @@ -25,6 +25,10 @@ #include "file_utils.h" #include "image_drawing.h" +#if defined(RV1106_1103) + #include "dma_alloc.hpp" +#endif + /*------------------------------------------- Main Function -------------------------------------------*/ @@ -55,6 +59,19 @@ int main(int argc, char **argv) image_buffer_t src_image; memset(&src_image, 0, sizeof(image_buffer_t)); ret = read_image(image_path, &src_image); + +#if defined(RV1106_1103) + //RV1106 rga requires that input and output bufs are memory allocated by dma + ret = dma_buf_alloc(RV1106_CMA_HEAP_PATH, src_image.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + (void **) & (rknn_app_ctx.img_dma_buf.dma_buf_virt_addr)); + memcpy(rknn_app_ctx.img_dma_buf.dma_buf_virt_addr, src_image.virt_addr, src_image.size); + dma_sync_cpu_to_device(rknn_app_ctx.img_dma_buf.dma_buf_fd); + free(src_image.virt_addr); + src_image.virt_addr = (unsigned char *)rknn_app_ctx.img_dma_buf.dma_buf_virt_addr; + src_image.fd = rknn_app_ctx.img_dma_buf.dma_buf_fd; + rknn_app_ctx.img_dma_buf.size = src_image.size; +#endif + if (ret != 0) { printf("read image fail! 
ret=%d image_path=%s\n", ret, image_path); @@ -103,7 +120,12 @@ int main(int argc, char **argv) if (src_image.virt_addr != NULL) { +#if defined(RV1106_1103) + dma_buf_free(rknn_app_ctx.img_dma_buf.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + rknn_app_ctx.img_dma_buf.dma_buf_virt_addr); +#else free(src_image.virt_addr); +#endif } return 0; diff --git a/examples/yolov8/cpp/postprocess.cc b/examples/yolov8/cpp/postprocess.cc index 729b398..3eaa376 100644 --- a/examples/yolov8/cpp/postprocess.cc +++ b/examples/yolov8/cpp/postprocess.cc @@ -194,10 +194,18 @@ static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale) return res; } +static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } +static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } -void compute_dfl(float* tensor, int dfl_len, float* box){ +static void compute_dfl(float* tensor, int dfl_len, float* box){ for (int b=0; b<4; b++){ float exp_t[dfl_len]; float exp_sum=0; @@ -214,6 +222,80 @@ void compute_dfl(float* tensor, int dfl_len, float* box){ } } +static int process_u8(uint8_t *box_tensor, int32_t box_zp, float box_scale, + uint8_t *score_tensor, int32_t score_zp, float score_scale, + uint8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + uint8_t score_thres_u8 = qnt_f32_to_affine_u8(threshold, score_zp, score_scale); + uint8_t score_sum_thres_u8 = qnt_f32_to_affine_u8(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + int offset = i * grid_w + j; + int max_class_id = -1; + + // Use score sum to quickly filter + if (score_sum_tensor != nullptr) + { + if (score_sum_tensor[offset] < score_sum_thres_u8) + { + continue; + } + } + + uint8_t max_score = -score_zp; + for (int c = 0; c < OBJ_CLASS_NUM; c++) + { + if ((score_tensor[offset] > score_thres_u8) && (score_tensor[offset] > max_score)) + { + max_score = score_tensor[offset]; + max_class_id = c; + } + offset += grid_len; + } + + // compute box + if (max_score > score_thres_u8) + { + offset = i * grid_w + j; + float box[4]; + float before_dfl[dfl_len * 4]; + for (int k = 0; k < dfl_len * 4; k++) + { + before_dfl[k] = deqnt_affine_u8_to_f32(box_tensor[offset], box_zp, box_scale); + offset += grid_len; + } + compute_dfl(before_dfl, dfl_len, box); + + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_u8_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount++; + } + } + } + return validCount; +} static int process_i8(int8_t *box_tensor, int32_t box_zp, float box_scale, int8_t *score_tensor, int32_t score_zp, float score_scale, @@ -351,8 +433,83 @@ static int process_fp32(float *box_tensor, float *score_tensor, float *score_sum } -int post_process(rknn_app_context_t *app_ctx, 
rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) +#if defined(RV1106_1103) +static int process_i8_rv1106(int8_t *box_tensor, int32_t box_zp, float box_scale, + int8_t *score_tensor, int32_t score_zp, float score_scale, + int8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) { + int validCount = 0; + int grid_len = grid_h * grid_w; + int8_t score_thres_i8 = qnt_f32_to_affine(threshold, score_zp, score_scale); + int8_t score_sum_thres_i8 = qnt_f32_to_affine(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) { + for (int j = 0; j < grid_w; j++) { + int offset = i * grid_w + j; + int max_class_id = -1; + + // 通过 score sum 起到快速过滤的作用 + if (score_sum_tensor != nullptr) { + //score_sum_tensor [1, 1, 80, 80] + if (score_sum_tensor[offset] < score_sum_thres_i8) { + continue; + } + } + + int8_t max_score = -score_zp; + offset = offset * OBJ_CLASS_NUM; + for (int c = 0; c < OBJ_CLASS_NUM; c++) { + if ((score_tensor[offset + c] > score_thres_i8) && (score_tensor[offset + c] > max_score)) { + max_score = score_tensor[offset + c]; //80类 [1, 80, 80, 80] 3588NCHW 1106NHWC + max_class_id = c; + } + } + + // compute box + if (max_score > score_thres_i8) { + offset = (i * grid_w + j) * 4 * dfl_len; + float box[4]; + float before_dfl[dfl_len*4]; + for (int k=0; k< dfl_len*4; k++){ + before_dfl[k] = deqnt_affine_to_f32(box_tensor[offset + k], box_zp, box_scale); + } + compute_dfl(before_dfl, dfl_len, box); + + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount ++; + } + } + } + printf("validCount=%d\n", validCount); + printf("grid h-%d, w-%d, stride %d\n", grid_h, grid_w, stride); + return validCount; +} +#endif + +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) { +#if defined(RV1106_1103) + rknn_tensor_mem **_outputs = (rknn_tensor_mem **)outputs; +#else + rknn_output *_outputs = (rknn_output *)outputs; +#endif std::vector filterBoxes; std::vector objProbs; std::vector classId; @@ -366,41 +523,86 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t memset(od_results, 0, sizeof(object_detect_result_list)); // default 3 branch +#ifdef RKNPU1 + int dfl_len = app_ctx->output_attrs[0].dims[2] / 4; +#else int dfl_len = app_ctx->output_attrs[0].dims[1] /4; +#endif int output_per_branch = app_ctx->io_num.n_output / 3; for (int i = 0; i < 3; i++) { +#if defined(RV1106_1103) + dfl_len = app_ctx->output_attrs[0].dims[3] /4; + void *score_sum = nullptr; + int32_t score_sum_zp = 0; + float score_sum_scale = 1.0; + if (output_per_branch == 3) { + score_sum = _outputs[i * output_per_branch + 2]->virt_addr; + score_sum_zp = app_ctx->output_attrs[i * output_per_branch + 2].zp; + score_sum_scale = app_ctx->output_attrs[i * output_per_branch + 2].scale; + } + int box_idx = i * output_per_branch; + int score_idx = i * output_per_branch + 1; + grid_h = 
app_ctx->output_attrs[box_idx].dims[1]; + grid_w = app_ctx->output_attrs[box_idx].dims[2]; + stride = model_in_h / grid_h; + + if (app_ctx->is_quant) { + validCount += process_i8_rv1106((int8_t *)_outputs[box_idx]->virt_addr, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (int8_t *)_outputs[score_idx]->virt_addr, app_ctx->output_attrs[score_idx].zp, + app_ctx->output_attrs[score_idx].scale, (int8_t *)score_sum, score_sum_zp, score_sum_scale, + grid_h, grid_w, stride, dfl_len, filterBoxes, objProbs, classId, conf_threshold); + } + else + { + printf("RV1106/1103 only support quantization mode\n", LABEL_NALE_TXT_PATH); + return -1; + } +#else void *score_sum = nullptr; int32_t score_sum_zp = 0; float score_sum_scale = 1.0; if (output_per_branch == 3){ - score_sum = outputs[i*output_per_branch + 2].buf; + score_sum = _outputs[i*output_per_branch + 2].buf; score_sum_zp = app_ctx->output_attrs[i*output_per_branch + 2].zp; score_sum_scale = app_ctx->output_attrs[i*output_per_branch + 2].scale; } int box_idx = i*output_per_branch; int score_idx = i*output_per_branch + 1; +#ifdef RKNPU1 + grid_h = app_ctx->output_attrs[box_idx].dims[1]; + grid_w = app_ctx->output_attrs[box_idx].dims[0]; +#else grid_h = app_ctx->output_attrs[box_idx].dims[2]; grid_w = app_ctx->output_attrs[box_idx].dims[3]; +#endif stride = model_in_h / grid_h; if (app_ctx->is_quant) { - validCount += process_i8((int8_t *)outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, - (int8_t *)outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, +#ifdef RKNPU1 + validCount += process_u8((uint8_t *)_outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (uint8_t *)_outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, + (uint8_t *)score_sum, score_sum_zp, score_sum_scale, + grid_h, grid_w, stride, dfl_len, + filterBoxes, objProbs, classId, conf_threshold); +#else + validCount += process_i8((int8_t *)_outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (int8_t *)_outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, (int8_t *)score_sum, score_sum_zp, score_sum_scale, grid_h, grid_w, stride, dfl_len, filterBoxes, objProbs, classId, conf_threshold); +#endif } else { - validCount += process_fp32((float *)outputs[box_idx].buf, (float *)outputs[score_idx].buf, (float *)score_sum, + validCount += process_fp32((float *)_outputs[box_idx].buf, (float *)_outputs[score_idx].buf, (float *)score_sum, grid_h, grid_w, stride, dfl_len, filterBoxes, objProbs, classId, conf_threshold); } - +#endif } // no object detect diff --git a/examples/yolov8/cpp/postprocess.h b/examples/yolov8/cpp/postprocess.h index 6d78638..0e932ca 100644 --- a/examples/yolov8/cpp/postprocess.h +++ b/examples/yolov8/cpp/postprocess.h @@ -30,7 +30,7 @@ typedef struct { int init_post_process(); void deinit_post_process(); char *coco_cls_to_name(int cls_id); -int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); void deinitPostProcess(); #endif //_RKNN_YOLOV8_DEMO_POSTPROCESS_H_ diff --git 
a/examples/yolov8/cpp/rknpu1/yolov8.cc b/examples/yolov8/cpp/rknpu1/yolov8.cc new file mode 100644 index 0000000..0681cef --- /dev/null +++ b/examples/yolov8/cpp/rknpu1/yolov8.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolov8.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov8_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type == RKNN_TENSOR_UINT8) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov8_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov8_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolov8/cpp/rknpu2/yolov8.cc b/examples/yolov8/cpp/rknpu2/yolov8.cc index f56a9d1..abad5aa 100644 --- a/examples/yolov8/cpp/rknpu2/yolov8.cc +++ b/examples/yolov8/cpp/rknpu2/yolov8.cc @@ -137,11 +137,6 @@ int init_yolov8_model(const char *model_path, rknn_app_context_t *app_ctx) int release_yolov8_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -152,6 +147,11 @@ int release_yolov8_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } diff --git a/examples/yolov8/cpp/rknpu2/yolov8_rv1106_1103.cc b/examples/yolov8/cpp/rknpu2/yolov8_rv1106_1103.cc new file mode 100644 index 0000000..e619a8c --- /dev/null +++ b/examples/yolov8/cpp/rknpu2/yolov8_rv1106_1103.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolov8.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov8_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + ret = rknn_init(&ctx, (char *)model_path, 0, 0, NULL); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + //When using the zero-copy API interface, query the native output tensor attribute + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // default input type is int8 (normalize and quantize need compute in outside) + // if set uint8, will fuse normalize and quantize to npu + input_attrs[0].type = RKNN_TENSOR_UINT8; + // default fmt is NHWC,1106 npu only support NHWC in zero copy mode + input_attrs[0].fmt = RKNN_TENSOR_NHWC; + printf("input_attrs[0].size_with_stride=%d\n", input_attrs[0].size_with_stride); + app_ctx->input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride); + + // Set input tensor memory + ret = rknn_set_io_mem(ctx, app_ctx->input_mems[0], &input_attrs[0]); + if (ret < 0) { + printf("input_mems rknn_set_io_mem fail! ret=%d\n", ret); + return -1; + } + + // Set output tensor memory + for (uint32_t i = 0; i < io_num.n_output; ++i) { + app_ctx->output_mems[i] = rknn_create_mem(ctx, output_attrs[i].size_with_stride); + ret = rknn_set_io_mem(ctx, app_ctx->output_mems[i], &output_attrs[i]); + if (ret < 0) { + printf("output_mems rknn_set_io_mem fail! 
ret=%d\n", ret); + return -1; + } + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[1]; + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[3]; + } else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[2]; + app_ctx->model_channel = input_attrs[0].dims[3]; + } + + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov8_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + for (int i = 0; i < app_ctx->io_num.n_input; i++) { + if (app_ctx->input_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); + } + } + for (int i = 0; i < app_ctx->io_num.n_output; i++) { + if (app_ctx->output_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); + } + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov8_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 + const float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.fd = app_ctx->input_mems[0]->fd; + if (dst_img.virt_addr == NULL && dst_img.fd == 0) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Post Process + post_process(app_ctx, app_ctx->output_mems, &letter_box, box_conf_threshold, nms_threshold, od_results); +out: + return ret; +} \ No newline at end of file diff --git a/examples/yolov8/cpp/yolov8.h b/examples/yolov8/cpp/yolov8.h index d264d1b..dac331d 100644 --- a/examples/yolov8/cpp/yolov8.h +++ b/examples/yolov8/cpp/yolov8.h @@ -19,11 +19,24 @@ #include "rknn_api.h" #include "common.h" +#if defined(RV1106_1103) + typedef struct { + char *dma_buf_virt_addr; + int dma_buf_fd; + int size; + }rknn_dma_buf; +#endif + typedef struct { rknn_context rknn_ctx; rknn_input_output_num io_num; rknn_tensor_attr* input_attrs; rknn_tensor_attr* output_attrs; +#if defined(RV1106_1103) + rknn_tensor_mem* input_mems[1]; + rknn_tensor_mem* output_mems[9]; + rknn_dma_buf img_dma_buf; +#endif int model_channel; int model_width; int model_height; diff --git a/examples/yolov8/model_comparison/yolov8_graph_comparison.jpg b/examples/yolov8/model_comparison/yolov8_graph_comparison.jpg new file mode 100644 index 0000000..2cbb4b9 Binary files /dev/null and b/examples/yolov8/model_comparison/yolov8_graph_comparison.jpg differ diff --git a/examples/yolov8/model_comparison/yolov8_output_comparison.jpg b/examples/yolov8/model_comparison/yolov8_output_comparison.jpg new file mode 100644 index 0000000..00378ce Binary files /dev/null and b/examples/yolov8/model_comparison/yolov8_output_comparison.jpg differ diff --git a/examples/yolov8/python/convert.py b/examples/yolov8/python/convert.py index 651c2c0..32fa8dd 100644 --- a/examples/yolov8/python/convert.py +++ b/examples/yolov8/python/convert.py @@ -1,6 +1,4 @@ -import os import sys -import numpy as np from rknn.api import RKNN DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt' @@ -10,8 +8,9 @@ def parse_arg(): if len(sys.argv) < 3: print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] @@ -20,10 +19,10 @@ def parse_arg(): do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False diff --git a/examples/yolov8_seg/README.md b/examples/yolov8_seg/README.md index e11c25a..5316db3 100644 --- a/examples/yolov8_seg/README.md +++ b/examples/yolov8_seg/README.md @@ -2,6 +2,7 @@ ## Table of contents +- [Table of contents](#table-of-contents) - [1. Description](#1-description) - [2. Current Support Platform](#2-current-support-platform) - [3. Pretrained Model](#3-pretrained-model) @@ -29,7 +30,7 @@ https://github.com/airockchip/ultralytics_yolov8 ## 2. Current Support Platform -RK3566, RK3568, RK3588, RK3562 +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 @@ -46,6 +47,18 @@ cd model ./download_model.sh ``` +**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolov8n-seg.onnx as an example to show the difference between them. +1. 
The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. As shown in the figure, the original output [1,116,8400] is divided into three groups. For example, in the group of outputs ([1,64,80,80],[1,80,80,80],[1,1,80,80],[1,32,80,80]): [1,64,80,80] holds the box coordinates, [1,80,80,80] holds the box confidences for the 80 categories, [1,1,80,80] holds the sum of those 80 confidences, and [1,32,80,80] holds the segmentation features. + +
+ *(figure: output comparison between the original and the optimized model)* +
+ +2. Taking the set of outputs ([1,64,20,20],[1,80,20,20],[1,1,20,20],[1,32,20,20]) as an example, we remove the subgraphs behind the three convolution nodes in the model (the framed part in the figure), keep the outputs of these three convolutions ([1,64,20,20],[1,80,20,20],[1,32,20,20]), and add a ReduceSum+Clip branch that computes the sum of the confidences of the 80 categories ([1,1,20,20]); a sketch of the mask computation follows the figure below. + +
+ *(figure: graph comparison between the original and the optimized model, with the removed subgraphs framed)* +
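A condensed sketch (illustrative only, not the patch's exact code) of how the segmentation branch turns each kept box's 32 coefficients and the [32, H, W] prototype tensor into a binary mask. The patch does this in two steps, `matmul_by_cpu_*` followed by `crop_mask_*`; the sketch fuses them into one loop and assumes the box has already been clamped to the mask bounds.

```c
/* mask(i,j) = 1 where sum_k coeffs[k] * proto[k][i][j] > 0, inside the box.
 * Thresholding at 0 is equivalent to sigmoid(x) > 0.5. */
#include <stdint.h>
#include <string.h>

#define PROTO_CHANNEL 32

static void box_mask(const float *coeffs,  /* [PROTO_CHANNEL] per-box */
                     const float *proto,   /* [PROTO_CHANNEL][h*w] */
                     uint8_t *mask,        /* [h*w] out, 0/1 */
                     int h, int w,
                     int x1, int y1, int x2, int y2) /* clamped box */
{
    memset(mask, 0, (size_t)h * w);
    for (int i = y1; i < y2 && i < h; i++) {
        for (int j = x1; j < x2 && j < w; j++) {
            float acc = 0.f;
            for (int k = 0; k < PROTO_CHANNEL; k++)
                acc += coeffs[k] * proto[k * h * w + i * w + j];
            mask[i * w + j] = acc > 0.f;  /* sign of the matmul result */
        }
    }
}
```

This sign test is why the quantized path in this patch (`matmul_by_cpu_uint8`) can write the mask directly from the matmul accumulator without ever applying the sigmoid.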
## 4. Convert to RKNN @@ -62,7 +75,7 @@ python convert.py - ``: Specify ONNX model path. - ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- `(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `i8`. +- `(optional)`: Specify as `i8`, `u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `i8`/`u8`. - `(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model with name `yolov8_seg.rknn` @@ -91,30 +104,12 @@ python yolov8_seg.py --model_path {rknn_model} --target {target_platform} --anno ## 6. Android Demo +**Note: RK1808, RV1109, RV1126 does not support Android.** #### 6.1 Compile and Build -*Usage:* - -```sh -# go back to the rknn_model_zoo root directory -cd ../../ -export ANDROID_NDK_PATH= - -./build-android.sh -t -a -d yolov8_seg - -# such as -./build-android.sh -t rk3588 -a arm64-v8a -d yolov8_seg -``` - -*Description:* -- ``: Specify Android NDK path. -- ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - ```shell - # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `yolov8_seg`.** #### 6.2 Push demo files to device @@ -148,31 +143,8 @@ export LD_LIBRARY_PATH=./lib #### 7.1 Compile and Build -*usage* - -```shell -# go back to the rknn_model_zoo root directory -cd ../../ - -# if GCC_COMPILER not found while building, please set GCC_COMPILER path -(optional)export GCC_COMPILER= - -./build-linux.sh -t -a -d yolov8_seg - -# such as -./build-linux.sh -t rk3588 -a aarch64 -d yolov8_seg -``` - -*Description:* - -- ``: Specified as GCC_COMPILER path. -- `` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - - ```shell - # Query architecture. For Linux, ['aarch64' or 'armhf'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. 
+**Note: Please replace the model name with `yolov8_seg`.** #### 7.2 Push demo files to device diff --git a/examples/yolov8_seg/cpp/CMakeLists.txt b/examples/yolov8_seg/cpp/CMakeLists.txt index 4c58223..a64392c 100644 --- a/examples/yolov8_seg/cpp/CMakeLists.txt +++ b/examples/yolov8_seg/cpp/CMakeLists.txt @@ -43,19 +43,38 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) #drm include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/drm) +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(postprocess_file rknpu1/postprocess.cc) + set(yolov8_seg_file rknpu1/yolov8_seg.cc) + #matmul +else() + set(postprocess_file rknpu2/postprocess.cc) + set(yolov8_seg_file rknpu2/yolov8_seg.cc) +endif() + add_executable(${PROJECT_NAME} main.cc - postprocess.cc - rknpu2/yolov8_seg.cc + ${postprocess_file} + ${yolov8_seg_file} ) -target_link_libraries(${PROJECT_NAME} - fileutils - imageutils - imagedrawing - ${OpenCV_LIBS} - ${LIBRKNNRT} -) +if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + target_link_libraries(${PROJECT_NAME} + fileutils + imageutils + imagedrawing + ${OpenCV_LIBS} + ${LIBRKNNRT} + ) +else() + target_link_libraries(${PROJECT_NAME} + fileutils + imageutils + imagedrawing + ${OpenCV_LIBS} + ${LIBRKNNRT} + ) +endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_link_libraries(${PROJECT_NAME} diff --git a/examples/yolov8_seg/cpp/easy_timer.h b/examples/yolov8_seg/cpp/easy_timer.h new file mode 100644 index 0000000..755c226 --- /dev/null +++ b/examples/yolov8_seg/cpp/easy_timer.h @@ -0,0 +1,43 @@ +#include +#include +#include +#include + +class TIMER{ + private: + struct timeval start_time, stop_time; + double __get_us(struct timeval t) { return (t.tv_sec * 1000000 + t.tv_usec); } + char indent[40]; + + public: + TIMER(){} + ~TIMER(){} + + void indent_set(char* s){ + strcpy(indent, s); + } + void indent_set(const char* s){ + strcpy(indent, s); + } + + void tik(){ + gettimeofday(&start_time, NULL); + } + + void tok(){ + gettimeofday(&stop_time, NULL); + } + + void print_time(char* str){ + printf("%s", indent); + printf("%s use: %f ms\n", str, get_time()); + } + void print_time(const char* str){ + printf("%s", indent); + printf("%s use: %f ms\n", str, get_time()); + } + + float get_time(){ + return (__get_us(stop_time) - __get_us(start_time))/1000; + } +}; \ No newline at end of file diff --git a/examples/yolov8_seg/cpp/rknpu1/postprocess.cc b/examples/yolov8_seg/cpp/rknpu1/postprocess.cc new file mode 100644 index 0000000..83db706 --- /dev/null +++ b/examples/yolov8_seg/cpp/rknpu1/postprocess.cc @@ -0,0 +1,833 @@ +// Copyright (c) 2021 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "yolov8_seg.h" + +#include +#include +#include +#include +#include +#include +#include +#include "easy_timer.h" + +#include +#include +#define LABEL_NALE_TXT_PATH "./model/coco_80_labels_list.txt" +// #define USE_FP_RESIZE + +static char *labels[OBJ_CLASS_NUM]; + +int clamp(float val, int min, int max) +{ + return val > min ? (val < max ? val : max) : min; +} + +static char *readLine(FILE *fp, char *buffer, int *len) +{ + int ch; + int i = 0; + size_t buff_len = 0; + + buffer = (char *)malloc(buff_len + 1); + if (!buffer) + return NULL; // Out of memory + + while ((ch = fgetc(fp)) != '\n' && ch != EOF) + { + buff_len++; + void *tmp = realloc(buffer, buff_len + 1); + if (tmp == NULL) + { + free(buffer); + return NULL; // Out of memory + } + buffer = (char *)tmp; + + buffer[i] = (char)ch; + i++; + } + buffer[i] = '\0'; + + *len = buff_len; + + // Detect end + if (ch == EOF && (i == 0 || ferror(fp))) + { + free(buffer); + return NULL; + } + return buffer; +} + +static int readLines(const char *fileName, char *lines[], int max_line) +{ + FILE *file = fopen(fileName, "r"); + char *s; + int i = 0; + int n = 0; + + if (file == NULL) + { + printf("Open %s fail!\n", fileName); + return -1; + } + + while ((s = readLine(file, s, &n)) != NULL) + { + lines[i++] = s; + if (i >= max_line) + break; + } + fclose(file); + return i; +} + +static int loadLabelName(const char *locationFilename, char *label[]) +{ + printf("load lable %s\n", locationFilename); + readLines(locationFilename, label, OBJ_CLASS_NUM); + return 0; +} + +static float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1, + float ymax1) +{ + float w = fmax(0.f, fmin(xmax0, xmax1) - fmax(xmin0, xmin1) + 1.0); + float h = fmax(0.f, fmin(ymax0, ymax1) - fmax(ymin0, ymin1) + 1.0); + float i = w * h; + float u = (xmax0 - xmin0 + 1.0) * (ymax0 - ymin0 + 1.0) + (xmax1 - xmin1 + 1.0) * (ymax1 - ymin1 + 1.0) - i; + return u <= 0.f ? 
0.f : (i / u); +} + +static int nms(int validCount, std::vector &outputLocations, std::vector classIds, std::vector &order, + int filterId, float threshold) +{ + for (int i = 0; i < validCount; ++i) + { + if (order[i] == -1 || classIds[i] != filterId) + { + continue; + } + int n = order[i]; + for (int j = i + 1; j < validCount; ++j) + { + int m = order[j]; + if (m == -1 || classIds[i] != filterId) + { + continue; + } + float xmin0 = outputLocations[n * 4 + 0]; + float ymin0 = outputLocations[n * 4 + 1]; + float xmax0 = outputLocations[n * 4 + 0] + outputLocations[n * 4 + 2]; + float ymax0 = outputLocations[n * 4 + 1] + outputLocations[n * 4 + 3]; + + float xmin1 = outputLocations[m * 4 + 0]; + float ymin1 = outputLocations[m * 4 + 1]; + float xmax1 = outputLocations[m * 4 + 0] + outputLocations[m * 4 + 2]; + float ymax1 = outputLocations[m * 4 + 1] + outputLocations[m * 4 + 3]; + + float iou = CalculateOverlap(xmin0, ymin0, xmax0, ymax0, xmin1, ymin1, xmax1, ymax1); + + if (iou > threshold) + { + order[j] = -1; + } + } + } + return 0; +} + +static int quick_sort_indice_inverse(std::vector &input, int left, int right, std::vector &indices) +{ + float key; + int key_index; + int low = left; + int high = right; + if (left < right) + { + key_index = indices[left]; + key = input[left]; + while (low < high) + { + while (low < high && input[high] <= key) + { + high--; + } + input[low] = input[high]; + indices[low] = indices[high]; + while (low < high && input[low] >= key) + { + low++; + } + input[high] = input[low]; + indices[high] = indices[low]; + } + input[low] = key; + indices[low] = key_index; + quick_sort_indice_inverse(input, left, low - 1, indices); + quick_sort_indice_inverse(input, low + 1, right, indices); + } + return low; +} + +void resize_by_opencv_fp(float *input_image, int input_width, int input_height, int boxes_num, float *output_image, int target_width, int target_height) +{ + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_32F, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(float)); + } +} + +void resize_by_opencv_uint8(uint8_t *input_image, int input_width, int input_height, int boxes_num, uint8_t *output_image, int target_width, int target_height) +{ + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_8U, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(uint8_t)); + } +} + +void crop_mask_fp(float *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) +{ + for (int b = 0; b < boxes_num; b++) + { + float x1 = boxes[b * 4 + 0]; + float y1 = boxes[b * 4 + 1]; + float x2 = boxes[b * 4 + 2]; + float y2 = boxes[b * 4 + 3]; + + for (int i = 0; i < height; i++) + { + for (int j = 0; j < width; j++) + { + if (j >= x1 && j < x2 && i >= y1 && i < y2) + { + if (all_mask_in_one[i * width + j] == 0) + { + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } + } + } + } + } + } +} + +void 
crop_mask_uint8(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) +{ + for (int b = 0; b < boxes_num; b++) + { + float x1 = boxes[b * 4 + 0]; + float y1 = boxes[b * 4 + 1]; + float x2 = boxes[b * 4 + 2]; + float y2 = boxes[b * 4 + 3]; + + for (int i = 0; i < height; i++) + { + for (int j = 0; j < width; j++) + { + if (j >= x1 && j < x2 && i >= y1 && i < y2) + { + if (all_mask_in_one[i * width + j] == 0) + { + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } + } + } + } + } + } +} + +void matmul_by_cpu_fp(std::vector &A, float *B, float *C, int ROWS_A, int COLS_A, int COLS_B) +{ + + float temp = 0; + for (int i = 0; i < ROWS_A; i++) + { + for (int j = 0; j < COLS_B; j++) + { + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + C[i * COLS_B + j] = temp; + } + } +} + +void matmul_by_cpu_uint8(std::vector &A, float *B, uint8_t *C, int ROWS_A, int COLS_A, int COLS_B) +{ + + float temp = 0; + for (int i = 0; i < ROWS_A; i++) + { + for (int j = 0; j < COLS_B; j++) + { + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + if (temp > 0) + { + C[i * COLS_B + j] = 4; + } + else + { + C[i * COLS_B + j] = 0; + } + } + } +} + +void seg_reverse(uint8_t *seg_mask, uint8_t *cropped_seg, uint8_t *seg_mask_real, + int model_in_height, int model_in_width, int cropped_height, int cropped_width, int ori_in_height, int ori_in_width, int y_pad, int x_pad) +{ + + if (y_pad == 0 && x_pad == 0 && ori_in_height == model_in_height && ori_in_width == model_in_width) + { + memcpy(seg_mask_real, seg_mask, ori_in_height * ori_in_width); + return; + } + + int cropped_index = 0; + for (int i = 0; i < model_in_height; i++) + { + for (int j = 0; j < model_in_width; j++) + { + if (i >= y_pad && i < model_in_height - y_pad && j >= x_pad && j < model_in_width - x_pad) + { + int seg_index = i * model_in_width + j; + cropped_seg[cropped_index] = seg_mask[seg_index]; + cropped_index++; + } + } + } + resize_by_opencv_uint8(cropped_seg, cropped_width, cropped_height, 1, seg_mask_real, ori_in_width, ori_in_height); +} + +static int box_reverse(int position, int boundary, int pad, float scale) +{ + return (int)((clamp(position, 0, boundary) - pad) / scale); +} + +static float sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); } + +static float unsigmoid(float y) { return -1.0 * logf((1.0 / y) - 1.0); } + +inline static int32_t __clip(float val, float min, float max) +{ + float f = val <= min ? min : (val >= max ? 
max : val); + return f; +} + +static float deqnt_affine_to_f32(uint8_t qnt, uint8_t zp, float scale) +{ + return ((float)qnt - (float)zp) * scale; +} + +static uint8_t qnt_f32_to_affine(float f32, uint8_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + +static void compute_dfl(float *tensor, int dfl_len, float *box) +{ + for (int b = 0; b < 4; b++) + { + float exp_t[dfl_len]; + float exp_sum = 0; + float acc_sum = 0; + for (int i = 0; i < dfl_len; i++) + { + exp_t[i] = exp(tensor[i + b * dfl_len]); + exp_sum += exp_t[i]; + } + + for (int i = 0; i < dfl_len; i++) + { + acc_sum += exp_t[i] / exp_sum * i; + } + box[b] = acc_sum; + } +} + +static int process_u8(rknn_output *all_input, int input_id, int grid_h, int grid_w, int height, int width, int stride, int dfl_len, + std::vector &boxes, std::vector &segments, float *proto, std::vector &objProbs, std::vector &classId, float threshold, + rknn_app_context_t *app_ctx) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + + // Skip if input_id is not 0, 4, 8, or 12 + if (input_id % 4 != 0) + { + return validCount; + } + + if (input_id == 12) + { + uint8_t *input_proto = (uint8_t *)all_input[input_id].buf; + uint8_t zp_proto = app_ctx->output_attrs[input_id].zp; + float scale_proto = app_ctx->output_attrs[input_id].scale; + for (int i = 0; i < PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT; i++) + { + proto[i] = deqnt_affine_to_f32(input_proto[i], zp_proto, scale_proto); + } + return validCount; + } + + uint8_t *box_tensor = (uint8_t *)all_input[input_id].buf; + uint8_t box_zp = app_ctx->output_attrs[input_id].zp; + float box_scale = app_ctx->output_attrs[input_id].scale; + + uint8_t *score_tensor = (uint8_t *)all_input[input_id + 1].buf; + uint8_t score_zp = app_ctx->output_attrs[input_id + 1].zp; + float score_scale = app_ctx->output_attrs[input_id + 1].scale; + + uint8_t *score_sum_tensor = nullptr; + uint8_t score_sum_zp = 0; + float score_sum_scale = 1.0; + score_sum_tensor = (uint8_t *)all_input[input_id + 2].buf; + score_sum_zp = app_ctx->output_attrs[input_id + 2].zp; + score_sum_scale = app_ctx->output_attrs[input_id + 2].scale; + + uint8_t *seg_tensor = (uint8_t *)all_input[input_id + 3].buf; + uint8_t seg_zp = app_ctx->output_attrs[input_id + 3].zp; + float seg_scale = app_ctx->output_attrs[input_id + 3].scale; + + uint8_t score_thres_u8 = qnt_f32_to_affine(threshold, score_zp, score_scale); + uint8_t score_sum_thres_u8 = qnt_f32_to_affine(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + int offset = i * grid_w + j; + int max_class_id = -1; + + int offset_seg = i * grid_w + j; + uint8_t *in_ptr_seg = seg_tensor + offset_seg; + + // for quick filtering through "score sum" + if (score_sum_tensor != nullptr) + { + if (score_sum_tensor[offset] < score_sum_thres_u8) + { + continue; + } + } + + uint8_t max_score = -score_zp; + for (int c = 0; c < OBJ_CLASS_NUM; c++) + { + if ((score_tensor[offset] > score_thres_u8) && (score_tensor[offset] > max_score)) + { + max_score = score_tensor[offset]; + max_class_id = c; + } + offset += grid_len; + } + + // compute box + if (max_score > score_thres_u8) + { + + for (int k = 0; k < PROTO_CHANNEL; k++) + { + float seg_element_fp = deqnt_affine_to_f32(in_ptr_seg[(k)*grid_len], seg_zp, seg_scale); + segments.push_back(seg_element_fp); + } + + offset = i * grid_w + j; + float box[4]; + float before_dfl[dfl_len * 4]; + for (int k = 0; k < dfl_len * 
4; k++) + { + before_dfl[k] = deqnt_affine_to_f32(box_tensor[offset], box_zp, box_scale); + offset += grid_len; + } + compute_dfl(before_dfl, dfl_len, box); + + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount++; + } + } + } + return validCount; +} + +static int process_fp32(rknn_output *all_input, int input_id, int grid_h, int grid_w, int height, int width, int stride, int dfl_len, + std::vector &boxes, std::vector &segments, float *proto, std::vector &objProbs, std::vector &classId, float threshold) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + + // Skip if input_id is not 0, 4, 8, or 12 + if (input_id % 4 != 0) + { + return validCount; + } + + if (input_id == 12) + { + float *input_proto = (float *)all_input[input_id].buf; + for (int i = 0; i < PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT; i++) + { + proto[i] = input_proto[i]; + } + return validCount; + } + + float *box_tensor = (float *)all_input[input_id].buf; + float *score_tensor = (float *)all_input[input_id + 1].buf; + float *score_sum_tensor = (float *)all_input[input_id + 2].buf; + float *seg_tensor = (float *)all_input[input_id + 3].buf; + + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + int offset = i * grid_w + j; + int max_class_id = -1; + + int offset_seg = i * grid_w + j; + float *in_ptr_seg = seg_tensor + offset_seg; + + // for quick filtering through "score sum" + if (score_sum_tensor != nullptr) + { + if (score_sum_tensor[offset] < threshold) + { + continue; + } + } + + float max_score = 0; + for (int c = 0; c < OBJ_CLASS_NUM; c++) + { + if ((score_tensor[offset] > threshold) && (score_tensor[offset] > max_score)) + { + max_score = score_tensor[offset]; + max_class_id = c; + } + offset += grid_len; + } + + // compute box + if (max_score > threshold) + { + + for (int k = 0; k < PROTO_CHANNEL; k++) + { + float seg_element_f32 = in_ptr_seg[(k)*grid_len]; + segments.push_back(seg_element_f32); + } + + offset = i * grid_w + j; + float box[4]; + float before_dfl[dfl_len * 4]; + for (int k = 0; k < dfl_len * 4; k++) + { + before_dfl[k] = box_tensor[offset]; + offset += grid_len; + } + compute_dfl(before_dfl, dfl_len, box); + + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(max_score); + classId.push_back(max_class_id); + validCount++; + } + } + } + return validCount; +} + +int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) +{ + + std::vector filterBoxes; + std::vector objProbs; + std::vector classId; + + std::vector filterSegments; + float proto[PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT]; + std::vector filterSegments_by_nms; + + int model_in_width = app_ctx->model_width; + int model_in_height = app_ctx->model_height; + + int validCount = 0; + int stride = 0; + int grid_h = 0; + int grid_w = 0; + + memset(od_results, 0, 
sizeof(object_detect_result_list)); + + int dfl_len = app_ctx->output_attrs[0].dims[2] / 4; + int output_per_branch = app_ctx->io_num.n_output / 3; // default 3 branch + + // process the outputs of rknn + for (int i = 0; i < 13; i++) + { + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[0]; + stride = model_in_height / grid_h; + + if (app_ctx->is_quant) + { + validCount += process_u8(outputs, i, grid_h, grid_w, model_in_height, model_in_width, stride, dfl_len, filterBoxes, filterSegments, proto, objProbs, + classId, conf_threshold, app_ctx); + } + else + { + validCount += process_fp32(outputs, i, grid_h, grid_w, model_in_height, model_in_width, stride, dfl_len, filterBoxes, filterSegments, proto, objProbs, + classId, conf_threshold); + } + } + + // nms + if (validCount <= 0) + { + return 0; + } + std::vector indexArray; + for (int i = 0; i < validCount; ++i) + { + indexArray.push_back(i); + } + + quick_sort_indice_inverse(objProbs, 0, validCount - 1, indexArray); + + std::set class_set(std::begin(classId), std::end(classId)); + + for (auto c : class_set) + { + nms(validCount, filterBoxes, classId, indexArray, c, nms_threshold); + } + + int last_count = 0; + od_results->count = 0; + + for (int i = 0; i < validCount; ++i) + { + if (indexArray[i] == -1 || last_count >= OBJ_NUMB_MAX_SIZE) + { + continue; + } + int n = indexArray[i]; + + float x1 = filterBoxes[n * 4 + 0]; + float y1 = filterBoxes[n * 4 + 1]; + float x2 = x1 + filterBoxes[n * 4 + 2]; + float y2 = y1 + filterBoxes[n * 4 + 3]; + int id = classId[n]; + float obj_conf = objProbs[i]; + + for (int k = 0; k < PROTO_CHANNEL; k++) + { + filterSegments_by_nms.push_back(filterSegments[n * PROTO_CHANNEL + k]); + } + + od_results->results[last_count].box.left = x1; + od_results->results[last_count].box.top = y1; + od_results->results[last_count].box.right = x2; + od_results->results[last_count].box.bottom = y2; + + od_results->results[last_count].prop = obj_conf; + od_results->results[last_count].cls_id = id; + last_count++; + } + od_results->count = last_count; + int boxes_num = od_results->count; + + float filterBoxes_by_nms[boxes_num * 4]; + int cls_id[boxes_num]; + for (int i = 0; i < boxes_num; i++) + { + // for crop_mask + filterBoxes_by_nms[i * 4 + 0] = od_results->results[i].box.left; // x1; + filterBoxes_by_nms[i * 4 + 1] = od_results->results[i].box.top; // y1; + filterBoxes_by_nms[i * 4 + 2] = od_results->results[i].box.right; // x2; + filterBoxes_by_nms[i * 4 + 3] = od_results->results[i].box.bottom; // y2; + cls_id[i] = od_results->results[i].cls_id; + + // get real box + od_results->results[i].box.left = box_reverse(od_results->results[i].box.left, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.top = box_reverse(od_results->results[i].box.top, model_in_height, letter_box->y_pad, letter_box->scale); + od_results->results[i].box.right = box_reverse(od_results->results[i].box.right, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.bottom = box_reverse(od_results->results[i].box.bottom, model_in_height, letter_box->y_pad, letter_box->scale); + } + + TIMER timer; +#ifdef USE_FP_RESIZE + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + float *matmul_out = (float *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(float)); + matmul_by_cpu_fp(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + timer.tok(); 
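+ // Shape note (assuming the usual yolov8-seg defaults PROTO_CHANNEL=32 and
+ // PROTO_HEIGHT=PROTO_WEIGHT=160): for N surviving boxes this step multiplies an
+ // (N x 32) mask-coefficient matrix by a (32 x 25600) prototype matrix, giving
+ // one 160x160 mask plane per box before the resize and crop steps below.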
+ timer.print_time("matmul_by_cpu_fp"); + + timer.tik(); + // resize to (boxes_num, model_in_width, model_in_height) + float *seg_mask = (float *)malloc(boxes_num * model_in_height * model_in_width * sizeof(float)); + resize_by_opencv_fp(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_fp"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_fp(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + timer.print_time("crop_mask_fp"); +#else + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + uint8_t *matmul_out = (uint8_t *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(uint8_t)); + matmul_by_cpu_uint8(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + timer.tok(); + timer.print_time("matmul_by_cpu_uint8"); + + timer.tik(); + uint8_t *seg_mask = (uint8_t *)malloc(boxes_num * model_in_height * model_in_width * sizeof(uint8_t)); + resize_by_opencv_uint8(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_uint8"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_uint8(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + timer.print_time("crop_mask_uint8"); +#endif + + timer.tik(); + // get real mask + int cropped_height = model_in_height - letter_box->y_pad * 2; + int cropped_width = model_in_width - letter_box->x_pad * 2; + int ori_in_height = app_ctx->input_image_height; + int ori_in_width = app_ctx->input_image_width; + int y_pad = letter_box->y_pad; + int x_pad = letter_box->x_pad; + uint8_t *cropped_seg_mask = (uint8_t *)malloc(cropped_height * cropped_width * sizeof(uint8_t)); + uint8_t *real_seg_mask = (uint8_t *)malloc(ori_in_height * ori_in_width * sizeof(uint8_t)); + seg_reverse(all_mask_in_one, cropped_seg_mask, real_seg_mask, + model_in_height, model_in_width, cropped_height, cropped_width, ori_in_height, ori_in_width, y_pad, x_pad); + od_results->results_seg[0].seg_mask = real_seg_mask; + free(all_mask_in_one); + free(cropped_seg_mask); + free(seg_mask); + free(matmul_out); + timer.tok(); + timer.print_time("seg_reverse"); + + return 0; +} + +int init_post_process() +{ + int ret = 0; + ret = loadLabelName(LABEL_NALE_TXT_PATH, labels); + if (ret < 0) + { + printf("Load %s failed!\n", LABEL_NALE_TXT_PATH); + return -1; + } + return 0; +} + +char *coco_cls_to_name(int cls_id) +{ + + if (cls_id >= OBJ_CLASS_NUM) + { + return "null"; + } + + if (labels[cls_id]) + { + return labels[cls_id]; + } + + return "null"; +} + +void deinit_post_process() +{ + for (int i = 0; i < OBJ_CLASS_NUM; i++) + { + { + free(labels[i]); + labels[i] = nullptr; + } + } +} diff --git a/examples/yolov8_seg/cpp/rknpu1/yolov8_seg.cc b/examples/yolov8_seg/cpp/rknpu1/yolov8_seg.cc new file mode 100644 index 0000000..13c42ff --- /dev/null +++ b/examples/yolov8_seg/cpp/rknpu1/yolov8_seg.cc @@ -0,0 +1,240 @@ +#include +#include 
+#include +#include + +#include "yolov8_seg.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolov8_seg_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolov8_seg_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolov8_seg_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; + const float box_conf_threshold = BOX_THRESH; + int bg_color = 114; // pad color for letterbox + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + app_ctx->input_image_width = img->width; + app_ctx->input_image_height = img->height; + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + goto out; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! 
ret=%d\n", ret); + goto out; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + goto out; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolov8_seg/cpp/postprocess.cc b/examples/yolov8_seg/cpp/rknpu2/postprocess.cc similarity index 72% rename from examples/yolov8_seg/cpp/postprocess.cc rename to examples/yolov8_seg/cpp/rknpu2/postprocess.cc index 819adfe..777a7fa 100644 --- a/examples/yolov8_seg/cpp/postprocess.cc +++ b/examples/yolov8_seg/cpp/rknpu2/postprocess.cc @@ -23,20 +23,18 @@ #include #include "rknn_matmul_api.h" #include "im2d.hpp" -#include "dma_alloc.cpp" -#include "drm_alloc.cpp" +#include "dma_alloc.hpp" +#include "drm_alloc.hpp" #include "Float16.h" +#include "easy_timer.h" #include #include #define LABEL_NALE_TXT_PATH "./model/coco_80_labels_list.txt" +// #define USE_FP_RESIZE static char *labels[OBJ_CLASS_NUM]; -const int anchor[3][6] = {{10, 13, 16, 30, 33, 23}, - {30, 61, 62, 45, 59, 119}, - {116, 90, 156, 198, 373, 326}}; - int clamp(float val, int min, int max) { return val > min ? (val < max ? 
val : max) : min; @@ -190,15 +188,29 @@ static int quick_sort_indice_inverse(std::vector &input, int left, int ri return low; } -static void resize_by_opencv(uint8_t *input_image, int input_width, int input_height, uint8_t *output_image, int target_width, int target_height) +void resize_by_opencv_fp(float *input_image, int input_width, int input_height, int boxes_num, float *output_image, int target_width, int target_height) { - cv::Mat src_image(input_height, input_width, CV_8U, input_image); - cv::Mat dst_image; - cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); - memcpy(output_image, dst_image.data, target_width * target_height); + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_32F, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(float)); + } } -static void resize_by_rga_rk3588(uint8_t *input_image, int input_width, int input_height, uint8_t *output_image, int target_width, int target_height) +void resize_by_opencv_uint8(uint8_t *input_image, int input_width, int input_height, int boxes_num, uint8_t *output_image, int target_width, int target_height) +{ + for (int b = 0; b < boxes_num; b++) + { + cv::Mat src_image(input_height, input_width, CV_8U, &input_image[b * input_width * input_height]); + cv::Mat dst_image; + cv::resize(src_image, dst_image, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + memcpy(&output_image[b * target_width * target_height], dst_image.data, target_width * target_height * sizeof(uint8_t)); + } +} + +void resize_by_rga_rk3588(uint8_t *input_image, int input_width, int input_height, uint8_t *output_image, int target_width, int target_height) { char *src_buf, *dst_buf; int src_buf_size, dst_buf_size; @@ -230,7 +242,15 @@ static void resize_by_rga_rk3588(uint8_t *input_image, int input_width, int inpu dst = wrapbuffer_handle(dst_handle, dst_width, dst_height, dst_format); src = wrapbuffer_handle(src_handle, src_width, src_height, src_format); - imresize(src, dst); + int ret = imresize(src, dst); + if (ret == IM_STATUS_SUCCESS) + { + printf("%s running success!\n", "rga_resize"); + } + else + { + printf("%s running failed, %s\n", "rga_resize", imStrError((IM_STATUS)ret)); + } memcpy(output_image, dst_buf, target_width * target_height); @@ -249,7 +269,7 @@ class DrmObject uint8_t *drm_buf; }; -static void resize_by_rga_rk356x(uint8_t *input_image, int input_width, int input_height, uint8_t *output_image, int target_width, int target_height) +void resize_by_rga_rk356x(uint8_t *input_image, int input_width, int input_height, uint8_t *output_image, int target_width, int target_height) { rga_buffer_handle_t src_handle, dst_handle; int src_width = input_width; @@ -286,7 +306,15 @@ static void resize_by_rga_rk356x(uint8_t *input_image, int input_width, int inpu dst = wrapbuffer_handle(dst_handle, dst_width, dst_height, dst_format); src = wrapbuffer_handle(src_handle, src_width, src_height, src_format); - imresize(src, dst); + int ret = imresize(src, dst); + if (ret == IM_STATUS_SUCCESS) + { + printf("%s running success!\n", "rga_resize"); + } + else + { + printf("%s running failed, %s\n", "rga_resize", imStrError((IM_STATUS)ret)); + } memcpy(output_image, drm_dst.drm_buf, target_width * target_height); @@ -296,7 +324,7 @@ static void 
resize_by_rga_rk356x(uint8_t *input_image, int input_width, int inpu drm_buf_destroy(drm_dst.drm_buffer_fd, drm_dst.drm_buffer_handle, drm_dst.drm_buf, drm_dst.actual_size); } -static void crop_mask(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) +void crop_mask_fp(float *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) { for (int b = 0; b < boxes_num; b++) { @@ -313,7 +341,14 @@ static void crop_mask(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, { if (all_mask_in_one[i * width + j] == 0) { - all_mask_in_one[i * width + j] = seg_mask[b * width * height + i * width + j] * (cls_id[b] + 1); + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } } } } @@ -321,83 +356,82 @@ static void crop_mask(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, } } -static void matmul_by_npu_i8(std::vector &A_input, float *B_input, uint8_t *C_input, int ROWS_A, int COLS_A, int COLS_B, rknn_app_context_t *app_ctx) +void crop_mask_uint8(uint8_t *seg_mask, uint8_t *all_mask_in_one, float *boxes, int boxes_num, int *cls_id, int height, int width) { - int B_layout = 0; - int AC_layout = 0; - int32_t M = 1; - int32_t K = COLS_A; - int32_t N = COLS_B; - - rknn_matmul_ctx ctx; - rknn_matmul_info info; - memset(&info, 0, sizeof(rknn_matmul_info)); - info.M = M; - info.K = K; - info.N = N; - info.type = RKNN_INT8_MM_INT8_TO_INT32; - info.B_layout = B_layout; - info.AC_layout = AC_layout; - - rknn_matmul_io_attr io_attr; - memset(&io_attr, 0, sizeof(rknn_matmul_io_attr)); - - int8_t int8Vector_A[ROWS_A * COLS_A]; - for (int i = 0; i < ROWS_A * COLS_A; ++i) + for (int b = 0; b < boxes_num; b++) { - int8Vector_A[i] = (int8_t)A_input[i]; - } + float x1 = boxes[b * 4 + 0]; + float y1 = boxes[b * 4 + 1]; + float x2 = boxes[b * 4 + 2]; + float y2 = boxes[b * 4 + 3]; - int8_t int8Vector_B[COLS_A * COLS_B]; - for (int i = 0; i < COLS_A * COLS_B; ++i) - { - int8Vector_B[i] = (int8_t)B_input[i]; + for (int i = 0; i < height; i++) + { + for (int j = 0; j < width; j++) + { + if (j >= x1 && j < x2 && i >= y1 && i < y2) + { + if (all_mask_in_one[i * width + j] == 0) + { + if (seg_mask[b * width * height + i * width + j] > 0) + { + all_mask_in_one[i * width + j] = (cls_id[b] + 1); + } + else + { + all_mask_in_one[i * width + j] = 0; + } + } + } + } + } } +} - int ret = rknn_matmul_create(&ctx, &info, &io_attr); - // Create A - rknn_tensor_mem *A = rknn_create_mem(ctx, io_attr.A.size); - // Create B - rknn_tensor_mem *B = rknn_create_mem(ctx, io_attr.B.size); - // Create C - rknn_tensor_mem *C = rknn_create_mem(ctx, io_attr.C.size); - - memcpy(B->virt_addr, int8Vector_B, B->size); - // Set A - ret = rknn_matmul_set_io_mem(ctx, A, &io_attr.A); - // Set B - ret = rknn_matmul_set_io_mem(ctx, B, &io_attr.B); - // Set C - ret = rknn_matmul_set_io_mem(ctx, C, &io_attr.C); +void matmul_by_cpu_fp(std::vector &A, float *B, float *C, int ROWS_A, int COLS_A, int COLS_B) +{ - for (int i = 0; i < ROWS_A; ++i) + float temp = 0; + for (int i = 0; i < ROWS_A; i++) { - memcpy(A->virt_addr, int8Vector_A + i * A->size, A->size); + for (int j = 0; j < COLS_B; j++) + { + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + C[i * COLS_B + j] = temp; + } + } +} - // Run - ret = rknn_matmul_run(ctx); +void matmul_by_cpu_uint8(std::vector &A, float *B, uint8_t *C, 
int ROWS_A, int COLS_A, int COLS_B) +{ - for (int j = 0; j < COLS_B; ++j) + float temp = 0; + for (int i = 0; i < ROWS_A; i++) + { + for (int j = 0; j < COLS_B; j++) { - if (((int32_t *)C->virt_addr)[j] > 0) + temp = 0; + for (int k = 0; k < COLS_A; k++) + { + temp += A[i * COLS_A + k] * B[k * COLS_B + j]; + } + if (temp > 0) { - C_input[i * COLS_B + j] = 1; + C[i * COLS_B + j] = 4; } else { - C_input[i * COLS_B + j] = 0; + C[i * COLS_B + j] = 0; } } } - - // destroy - rknn_destroy_mem(ctx, A); - rknn_destroy_mem(ctx, B); - rknn_destroy_mem(ctx, C); - rknn_matmul_destroy(ctx); } -static void matmul_by_npu_fp16(std::vector &A_input, float *B_input, uint8_t *C_input, int ROWS_A, int COLS_A, int COLS_B, rknn_app_context_t *app_ctx) +void matmul_by_npu_fp(std::vector &A_input, float *B_input, float *C_input, int ROWS_A, int COLS_A, int COLS_B, rknn_app_context_t *app_ctx) { int B_layout = 0; int AC_layout = 0; @@ -452,14 +486,7 @@ static void matmul_by_npu_fp16(std::vector &A_input, float *B_input, uint ret = rknn_matmul_run(ctx); for (int i = 0; i < ROWS_A * COLS_B; ++i) { - if (((float *)C->virt_addr)[i] > 0) - { - C_input[i] = 1; - } - else - { - C_input[i] = 0; - } + C_input[i] = ((float *)C->virt_addr)[i]; } // destroy @@ -469,25 +496,32 @@ static void matmul_by_npu_fp16(std::vector &A_input, float *B_input, uint rknn_matmul_destroy(ctx); } -static void seg_reverse(uint8_t *seg_mask, uint8_t *cropped_seg, uint8_t *seg_mask_real, - int model_in_height, int model_in_width, int proto_height, int proto_width, int cropped_height, int cropped_width, int ori_in_height, int ori_in_width, int y_pad, int x_pad) +void seg_reverse(uint8_t *seg_mask, uint8_t *cropped_seg, uint8_t *seg_mask_real, + int model_in_height, int model_in_width, int cropped_height, int cropped_width, int ori_in_height, int ori_in_width, int y_pad, int x_pad) { + + if (y_pad == 0 && x_pad == 0 && ori_in_height == model_in_height && ori_in_width == model_in_width) + { + memcpy(seg_mask_real, seg_mask, ori_in_height * ori_in_width); + return; + } + int cropped_index = 0; - for (int i = 0; i < proto_height; i++) + for (int i = 0; i < model_in_height; i++) { - for (int j = 0; j < proto_width; j++) + for (int j = 0; j < model_in_width; j++) { - if (i >= y_pad && i < proto_height - y_pad && j >= x_pad && j < proto_width - x_pad) + if (i >= y_pad && i < model_in_height - y_pad && j >= x_pad && j < model_in_width - x_pad) { - int seg_index = i * proto_width + j; + int seg_index = i * model_in_width + j; cropped_seg[cropped_index] = seg_mask[seg_index]; cropped_index++; } } } - - // Note: Here are different methods provided for implementing single-channel image scaling - resize_by_opencv(cropped_seg, cropped_width, cropped_height, seg_mask_real, ori_in_width, ori_in_height); + // Note: Here are different methods provided for implementing single-channel image scaling. + // The method of using rga to resize the image requires that the image size is 2 aligned. 
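+ // e.g. a 639x479 mask is not 2-aligned and would need padding to 640x480 before
+ // the resize_by_rga_* paths can be used, so resize_by_opencv_uint8 is the default here.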
+ resize_by_opencv_uint8(cropped_seg, cropped_width, cropped_height, 1, seg_mask_real, ori_in_width, ori_in_height); // resize_by_rga_rk356x(cropped_seg, cropped_width, cropped_height, seg_mask_real, ori_in_width, ori_in_height); // resize_by_rga_rk3588(cropped_seg, cropped_width, cropped_height, seg_mask_real, ori_in_width, ori_in_height); } @@ -554,9 +588,10 @@ static int process_i8(rknn_output *all_input, int input_id, int grid_h, int grid { int8_t *input_proto = (int8_t *)all_input[input_id].buf; int32_t zp_proto = app_ctx->output_attrs[input_id].zp; + float scale_proto = app_ctx->output_attrs[input_id].scale; for (int i = 0; i < PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT; i++) { - proto[i] = input_proto[i] - zp_proto; + proto[i] = deqnt_affine_to_f32(input_proto[i], zp_proto, scale_proto); } return validCount; } @@ -619,8 +654,8 @@ static int process_i8(rknn_output *all_input, int input_id, int grid_h, int grid for (int k = 0; k < PROTO_CHANNEL; k++) { - int8_t seg_element_i8 = in_ptr_seg[(k)*grid_len] - seg_zp; - segments.push_back(seg_element_i8); + float seg_element_fp = deqnt_affine_to_f32(in_ptr_seg[(k)*grid_len], seg_zp, seg_scale); + segments.push_back(seg_element_fp); } offset = i * grid_w + j; @@ -763,8 +798,8 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t float proto[PROTO_CHANNEL * PROTO_HEIGHT * PROTO_WEIGHT]; std::vector filterSegments_by_nms; - int model_in_w = app_ctx->model_width; - int model_in_h = app_ctx->model_height; + int model_in_width = app_ctx->model_width; + int model_in_height = app_ctx->model_height; int validCount = 0; int stride = 0; @@ -781,16 +816,16 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t { grid_h = app_ctx->output_attrs[i].dims[2]; grid_w = app_ctx->output_attrs[i].dims[3]; - stride = model_in_h / grid_h; + stride = model_in_height / grid_h; if (app_ctx->is_quant) { - validCount += process_i8(outputs, i, grid_h, grid_w, model_in_h, model_in_w, stride, dfl_len, filterBoxes, filterSegments, proto, objProbs, + validCount += process_i8(outputs, i, grid_h, grid_w, model_in_height, model_in_width, stride, dfl_len, filterBoxes, filterSegments, proto, objProbs, classId, conf_threshold, app_ctx); } else { - validCount += process_fp32(outputs, i, grid_h, grid_w, model_in_h, model_in_w, stride, dfl_len, filterBoxes, filterSegments, proto, objProbs, + validCount += process_fp32(outputs, i, grid_h, grid_w, model_in_height, model_in_width, stride, dfl_len, filterBoxes, filterSegments, proto, objProbs, classId, conf_threshold); } } @@ -848,59 +883,99 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t last_count++; } od_results->count = last_count; - int boxes_num = od_results->count; - // compute the mask (binary matrix) through Matmul - int ROWS_A = boxes_num; - int COLS_A = PROTO_CHANNEL; - int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; - uint8_t matmul_out[boxes_num * PROTO_HEIGHT * PROTO_WEIGHT]; - if (app_ctx->is_quant) - { - matmul_by_npu_i8(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B, app_ctx); - } - else - { - matmul_by_npu_fp16(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B, app_ctx); - } - float filterBoxes_by_nms[boxes_num * 4]; int cls_id[boxes_num]; for (int i = 0; i < boxes_num; i++) { // for crop_mask - // 640 / 160 = 4.0 - filterBoxes_by_nms[i * 4 + 0] = od_results->results[i].box.left / 4.0; // x1; - filterBoxes_by_nms[i * 4 + 1] = od_results->results[i].box.top / 4.0; // y1; - filterBoxes_by_nms[i * 4 + 
2] = od_results->results[i].box.right / 4.0; // x2; - filterBoxes_by_nms[i * 4 + 3] = od_results->results[i].box.bottom / 4.0; // y2; + filterBoxes_by_nms[i * 4 + 0] = od_results->results[i].box.left; // x1; + filterBoxes_by_nms[i * 4 + 1] = od_results->results[i].box.top; // y1; + filterBoxes_by_nms[i * 4 + 2] = od_results->results[i].box.right; // x2; + filterBoxes_by_nms[i * 4 + 3] = od_results->results[i].box.bottom; // y2; cls_id[i] = od_results->results[i].cls_id; // get real box - od_results->results[i].box.left = box_reverse(od_results->results[i].box.left, model_in_w, letter_box->x_pad, letter_box->scale); - od_results->results[i].box.top = box_reverse(od_results->results[i].box.top, model_in_h, letter_box->y_pad, letter_box->scale); - od_results->results[i].box.right = box_reverse(od_results->results[i].box.right, model_in_w, letter_box->x_pad, letter_box->scale); - od_results->results[i].box.bottom = box_reverse(od_results->results[i].box.bottom, model_in_h, letter_box->y_pad, letter_box->scale); + od_results->results[i].box.left = box_reverse(od_results->results[i].box.left, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.top = box_reverse(od_results->results[i].box.top, model_in_height, letter_box->y_pad, letter_box->scale); + od_results->results[i].box.right = box_reverse(od_results->results[i].box.right, model_in_width, letter_box->x_pad, letter_box->scale); + od_results->results[i].box.bottom = box_reverse(od_results->results[i].box.bottom, model_in_height, letter_box->y_pad, letter_box->scale); } - // crop seg outside box - uint8_t all_mask_in_one[PROTO_HEIGHT * PROTO_WEIGHT] = {0}; - crop_mask(matmul_out, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, PROTO_HEIGHT, PROTO_WEIGHT); - + TIMER timer; +#ifdef USE_FP_RESIZE + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + float *matmul_out = (float *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(float)); + matmul_by_cpu_fp(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + // matmul_by_npu_fp(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B, app_ctx); + timer.tok(); + timer.print_time("matmul_by_cpu_fp"); + + timer.tik(); + // resize to (boxes_num, model_in_width, model_in_height) + float *seg_mask = (float *)malloc(boxes_num * model_in_height * model_in_width * sizeof(float)); + resize_by_opencv_fp(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_fp"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_fp(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + timer.print_time("crop_mask_fp"); +#else + timer.tik(); + // compute the mask through Matmul + int ROWS_A = boxes_num; + int COLS_A = PROTO_CHANNEL; + int COLS_B = PROTO_HEIGHT * PROTO_WEIGHT; + uint8_t *matmul_out = (uint8_t *)malloc(boxes_num * PROTO_HEIGHT * PROTO_WEIGHT * sizeof(uint8_t)); + matmul_by_cpu_uint8(filterSegments_by_nms, proto, matmul_out, ROWS_A, COLS_A, COLS_B); + + timer.tok(); + timer.print_time("matmul_by_cpu_uint8"); + + timer.tik(); + uint8_t *seg_mask = (uint8_t *)malloc(boxes_num * model_in_height * model_in_width * 
sizeof(uint8_t)); + resize_by_opencv_uint8(matmul_out, PROTO_WEIGHT, PROTO_HEIGHT, boxes_num, seg_mask, model_in_width, model_in_height); + timer.tok(); + timer.print_time("resize_by_opencv_uint8"); + + timer.tik(); + // crop mask + uint8_t *all_mask_in_one = (uint8_t *)malloc(model_in_height * model_in_width * sizeof(uint8_t)); + memset(all_mask_in_one, 0, model_in_height * model_in_width * sizeof(uint8_t)); + crop_mask_uint8(seg_mask, all_mask_in_one, filterBoxes_by_nms, boxes_num, cls_id, model_in_height, model_in_width); + timer.tok(); + timer.print_time("crop_mask_uint8"); +#endif + + timer.tik(); // get real mask - int cropped_height = PROTO_HEIGHT - letter_box->y_pad / 4 * 2; - int cropped_width = PROTO_WEIGHT - letter_box->x_pad / 4 * 2; - int y_pad = letter_box->y_pad / 4; // 640 / 160 = 4 - int x_pad = letter_box->x_pad / 4; - int ori_in_height = (model_in_h - letter_box->y_pad * 2) / letter_box->scale; - int ori_in_width = (model_in_w - letter_box->x_pad * 2) / letter_box->scale; + int cropped_height = model_in_height - letter_box->y_pad * 2; + int cropped_width = model_in_width - letter_box->x_pad * 2; + int ori_in_height = app_ctx->input_image_height; + int ori_in_width = app_ctx->input_image_width; + int y_pad = letter_box->y_pad; + int x_pad = letter_box->x_pad; uint8_t *cropped_seg_mask = (uint8_t *)malloc(cropped_height * cropped_width * sizeof(uint8_t)); uint8_t *real_seg_mask = (uint8_t *)malloc(ori_in_height * ori_in_width * sizeof(uint8_t)); seg_reverse(all_mask_in_one, cropped_seg_mask, real_seg_mask, - model_in_h, model_in_w, PROTO_HEIGHT, PROTO_WEIGHT, cropped_height, cropped_width, ori_in_height, ori_in_width, y_pad, x_pad); + model_in_height, model_in_width, cropped_height, cropped_width, ori_in_height, ori_in_width, y_pad, x_pad); od_results->results_seg[0].seg_mask = real_seg_mask; + free(all_mask_in_one); free(cropped_seg_mask); + free(seg_mask); + free(matmul_out); + timer.tok(); + timer.print_time("seg_reverse"); return 0; } diff --git a/examples/yolov8_seg/cpp/rknpu2/yolov8_seg.cc b/examples/yolov8_seg/cpp/rknpu2/yolov8_seg.cc index 271fa39..03fb808 100644 --- a/examples/yolov8_seg/cpp/rknpu2/yolov8_seg.cc +++ b/examples/yolov8_seg/cpp/rknpu2/yolov8_seg.cc @@ -125,11 +125,6 @@ int init_yolov8_seg_model(const char *model_path, rknn_app_context_t *app_ctx) int release_yolov8_seg_model(rknn_app_context_t *app_ctx) { - if (app_ctx->rknn_ctx != 0) - { - rknn_destroy(app_ctx->rknn_ctx); - app_ctx->rknn_ctx = 0; - } if (app_ctx->input_attrs != NULL) { free(app_ctx->input_attrs); @@ -140,6 +135,11 @@ int release_yolov8_seg_model(rknn_app_context_t *app_ctx) free(app_ctx->output_attrs); app_ctx->output_attrs = NULL; } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } return 0; } @@ -166,6 +166,8 @@ int inference_yolov8_seg_model(rknn_app_context_t *app_ctx, image_buffer_t *img, memset(outputs, 0, sizeof(outputs)); // Pre Process + app_ctx->input_image_width = img->width; + app_ctx->input_image_height = img->height; dst_img.width = app_ctx->model_width; dst_img.height = app_ctx->model_height; dst_img.format = IMAGE_FORMAT_RGB888; diff --git a/examples/yolov8_seg/cpp/yolov8_seg.h b/examples/yolov8_seg/cpp/yolov8_seg.h index 03d940c..a2d81c9 100644 --- a/examples/yolov8_seg/cpp/yolov8_seg.h +++ b/examples/yolov8_seg/cpp/yolov8_seg.h @@ -27,6 +27,8 @@ typedef struct { int model_channel; int model_width; int model_height; + int input_image_width; + int input_image_height; bool is_quant; } rknn_app_context_t; diff --git 
a/examples/yolov8_seg/model_comparison/yolov8_seg_graph_comparison.jpg b/examples/yolov8_seg/model_comparison/yolov8_seg_graph_comparison.jpg new file mode 100644 index 0000000..8de992b Binary files /dev/null and b/examples/yolov8_seg/model_comparison/yolov8_seg_graph_comparison.jpg differ diff --git a/examples/yolov8_seg/model_comparison/yolov8_seg_output_comparison.jpg b/examples/yolov8_seg/model_comparison/yolov8_seg_output_comparison.jpg new file mode 100644 index 0000000..8e8008a Binary files /dev/null and b/examples/yolov8_seg/model_comparison/yolov8_seg_output_comparison.jpg differ diff --git a/examples/yolov8_seg/python/convert.py b/examples/yolov8_seg/python/convert.py index 6f3295f..b2f2d8a 100644 --- a/examples/yolov8_seg/python/convert.py +++ b/examples/yolov8_seg/python/convert.py @@ -1,52 +1,53 @@ -import os import sys -import numpy as np from rknn.api import RKNN DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt' +DEFAULT_RKNN_PATH = '../model/yolov8_seg.rknn' +DEFAULT_QUANT = True -if __name__ == '__main__': - +def parse_arg(): if len(sys.argv) < 3: - print( - "Usage: python3 {} [onnx_model_path] [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) - print(" platform choose from [rk3562,rk3566,rk3568,rk3588]") - print(" dtype choose from [i8, fp]") + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])); + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") exit(1) model_path = sys.argv[1] platform = sys.argv[2] + do_quant = DEFAULT_QUANT if len(sys.argv) > 3: model_type = sys.argv[3] - if model_type not in ['i8', 'fp']: + if model_type not in ['i8', 'u8', 'fp']: print("ERROR: Invalid model type: {}".format(model_type)) exit(1) - elif model_type == 'i8': + elif model_type in ['i8', 'u8']: do_quant = True else: do_quant = False - else: - do_quant = True if len(sys.argv) > 4: output_path = sys.argv[4] else: - output_path = '../model/yolov8-seg.rknn' + output_path = DEFAULT_RKNN_PATH + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() # Create RKNN object rknn = RKNN(verbose=False) # Pre-process config print('--> Config model') - rknn.config(mean_values=[[0, 0, 0]], std_values=[ - [255, 255, 255]], target_platform=platform) + rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]], target_platform=platform) print('done') # Load model print('--> Loading model') ret = rknn.load_onnx(model=model_path) - # ret = rknn.load_pytorch(model=model_path, input_size_list=[[1, 3, 640, 640]]) if ret != 0: print('Load model failed!') exit(ret) @@ -66,8 +67,8 @@ if ret != 0: print('Export rknn model failed!') exit(ret) - print('--> The RKNN model saved in: {}'.format(output_path)) print('done') # Release rknn.release() + diff --git a/examples/yolov8_seg/reference_results/yolov8s_seg_c_demo_result.jpg b/examples/yolov8_seg/reference_results/yolov8s_seg_c_demo_result.jpg deleted file mode 100644 index 455e6b2..0000000 Binary files a/examples/yolov8_seg/reference_results/yolov8s_seg_c_demo_result.jpg and /dev/null differ diff --git a/examples/yolov8_seg/reference_results/yolov8s_seg_c_demo_result.png b/examples/yolov8_seg/reference_results/yolov8s_seg_c_demo_result.png new file mode 100644 index 0000000..267314e Binary files 
/dev/null and b/examples/yolov8_seg/reference_results/yolov8s_seg_c_demo_result.png differ diff --git a/examples/yolov8_seg/reference_results/yolov8s_seg_python_demo_result.jpg b/examples/yolov8_seg/reference_results/yolov8s_seg_python_demo_result.jpg deleted file mode 100644 index 42626b6..0000000 Binary files a/examples/yolov8_seg/reference_results/yolov8s_seg_python_demo_result.jpg and /dev/null differ diff --git a/examples/yolov8_seg/reference_results/yolov8s_seg_python_demo_result.png b/examples/yolov8_seg/reference_results/yolov8s_seg_python_demo_result.png new file mode 100644 index 0000000..4efa29f Binary files /dev/null and b/examples/yolov8_seg/reference_results/yolov8s_seg_python_demo_result.png differ diff --git a/examples/yolox/README.md b/examples/yolox/README.md index 6b824c7..2e3dc10 100644 --- a/examples/yolox/README.md +++ b/examples/yolox/README.md @@ -29,7 +29,7 @@ https://github.com/airockchip/YOLOX ## 2. Current Support Platform -RK3566, RK3568, RK3588, RK3562 +RK3566, RK3568, RK3588, RK3562, RK1808, RV1109, RV1126 @@ -46,6 +46,18 @@ cd model ./download_model.sh ``` +**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolox_s.onnx as an example to show the difference between them. +1. The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. As shown in the figure, the original one output is split into three outputs. + +
+ *(figure: output comparison; left is the official original model with a single output, right is the optimized model with that output split into three)*
+ +2. We remove the subgraph following the three concat nodes in the model and keep the outputs of those three concat nodes ([1,85,80,80], [1,85,40,40], [1,85,20,20]); a short sketch of how these raw outputs are decoded on the CPU follows this note. + +
+ *(figure: graph comparison; the subgraph following the three concat nodes is removed and the concat outputs become the model outputs)*
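Since the decode subgraph is removed from the ONNX model, box recovery happens on the CPU. Below is a minimal NumPy sketch of how one of the three kept `[1,85,H,W]` outputs can be decoded; it mirrors the `process_fp32` logic in `cpp/postprocess.cc`, and the function and variable names here are illustrative, not part of the demo code.

```python
import numpy as np

def decode_branch(out, stride, conf_thres=0.25):
    # out: one kept concat output, shape [1, 85, H, W]
    # channel layout per grid cell: [x, y, w, h, box_conf, 80 class scores]
    _, _, h, w = out.shape
    gy, gx = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")

    cx = (out[0, 0] + gx) * stride        # cell offset + grid index, scaled to pixels
    cy = (out[0, 1] + gy) * stride
    bw = np.exp(out[0, 2]) * stride       # width/height are log-encoded
    bh = np.exp(out[0, 3]) * stride

    box_conf = out[0, 4]
    cls_scores = out[0, 5:]               # (80, H, W)
    score = box_conf * cls_scores.max(axis=0)
    cls_id = cls_scores.argmax(axis=0)

    keep = score >= conf_thres
    boxes = np.stack([cx - bw / 2, cy - bh / 2, bw, bh], axis=-1)[keep]
    return boxes, score[keep], cls_id[keep]

# For a 640x640 input the three outputs map to strides 8, 16 and 32:
# [1,85,80,80] -> stride 8, [1,85,40,40] -> stride 16, [1,85,20,20] -> stride 32.
```

NMS then runs over the concatenated results of all three branches, exactly as in the C demo.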
## 4. Convert to RKNN @@ -65,7 +77,7 @@ python convert.py ../model/yolox_s.onnx rk3588 - ``: Specify ONNX model path. - ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- `(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `i8`. +- `(optional)`: Specify as `i8`, `u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `i8`. - `(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model with name `yolox.rknn` @@ -93,30 +105,12 @@ python yolox.py --model_path --target --img_show ## 6. Android Demo -#### 6.1 Compile and Build +**Note: RK1808, RV1109, RV1126 does not support Android.** -*Usage:* - -```sh -# go back to the rknn_model_zoo root directory -cd ../../ -export ANDROID_NDK_PATH= - -./build-android.sh -t -a -d yolox - -# such as -./build-android.sh -t rk3588 -a arm64-v8a -d yolox -``` +#### 6.1 Compile and Build -*Description:* - -- ``: Specify Android NDK path. -- ``: Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - ```shell - # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `yolox`.** #### 6.2 Push demo files to device @@ -152,31 +146,8 @@ export LD_LIBRARY_PATH=./lib #### 7.1 Compile and Build -*Usage:* - -```shell -# go back to the rknn_model_zoo root directory -cd ../../ - -# if GCC_COMPILER not found while building, please set GCC_COMPILER path -(optional)export GCC_COMPILER= - -./build-linux.sh -t -a -d yolox - -# such as -./build-linux.sh -t rk3588 -a aarch64 -d yolox -``` - -*Description:* - -- ``: Specified as GCC_COMPILER path. -- `` : Specify NPU platform name. Support Platform refer [here](#2 Current Support Platform). -- ``: Specify device system architecture. To query device architecture, refer to the following command: - - ```shell - # Query architecture. For Linux, ['aarch64' or 'armhf'] should shown in log. - adb shell cat /proc/version - ``` +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to setup a cross-compilation environment and complete the compilation of C/C++ Demo. +**Note: Please replace the model name with `yolox`.** #### 7.2 Push demo files to device @@ -222,4 +193,4 @@ person @ (79 327 118 518) 0.508 -- Note: Different platforms, different versions of tools and drivers may have slightly different results. \ No newline at end of file +- Note: Different platforms, different versions of tools and drivers may have slightly different results. 
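One detail behind the `i8`/`u8` split: the C post-processing compares scores in the quantized domain, so the float threshold is first mapped through the output tensor's affine parameters (see `qnt_f32_to_affine_u8` in `cpp/postprocess.cc`). A hedged worked example, using an illustrative zero point and scale rather than values from a real model:

```python
def quantize_u8(f32, zp, scale):
    # same affine mapping as qnt_f32_to_affine_u8(): clip(f32/scale + zp, 0, 255)
    return int(min(max(f32 / scale + zp, 0), 255))

# With zp=0 and scale=1/255, BOX_THRESH (0.25) maps to int(63.75) = 63.
print(quantize_u8(0.25, zp=0, scale=1 / 255))  # 63
```

This lets `process_u8` discard most grid cells with a single byte comparison before dequantizing anything.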
diff --git a/examples/yolox/cpp/CMakeLists.txt b/examples/yolox/cpp/CMakeLists.txt index 27e55e0..8e01844 100644 --- a/examples/yolox/cpp/CMakeLists.txt +++ b/examples/yolox/cpp/CMakeLists.txt @@ -9,6 +9,21 @@ if (ENABLE_ASAN) set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") endif () +set(rknpu_yolox_file rknpu2/yolox.cc) + +if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") + add_definitions(-DRV1106_1103) + set(rknpu_yolox_file rknpu2/yolox_rv1106_1103.cc) + #dma + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +endif() + +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + add_definitions(-DRKNPU1) + set(rknpu_yolox_file rknpu1/yolox.cc) +endif() + + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) @@ -19,14 +34,15 @@ file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) add_executable(${PROJECT_NAME} main.cc postprocess.cc - rknpu2/yolox.cc + ${rknpu_yolox_file} ) target_link_libraries(${PROJECT_NAME} - fileutils imageutils + fileutils imagedrawing ${LIBRKNNRT} + dl ) if (CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/examples/yolox/cpp/main.cc b/examples/yolox/cpp/main.cc index 18c98ee..57dea42 100644 --- a/examples/yolox/cpp/main.cc +++ b/examples/yolox/cpp/main.cc @@ -26,6 +26,10 @@ #include "image_drawing.h" #include "easy_timer.h" +#if defined(RV1106_1103) + #include "dma_alloc.hpp" +#endif + /*------------------------------------------- Main Function -------------------------------------------*/ @@ -63,6 +67,19 @@ int main(int argc, char **argv) image_buffer_t src_image; memset(&src_image, 0, sizeof(image_buffer_t)); ret = read_image(image_path, &src_image); + +#if defined(RV1106_1103) + //RV1106 rga requires that input and output bufs are memory allocated by dma + ret = dma_buf_alloc(RV1106_CMA_HEAP_PATH, src_image.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + (void **) & (rknn_app_ctx.img_dma_buf.dma_buf_virt_addr)); + memcpy(rknn_app_ctx.img_dma_buf.dma_buf_virt_addr, src_image.virt_addr, src_image.size); + dma_sync_cpu_to_device(rknn_app_ctx.img_dma_buf.dma_buf_fd); + free(src_image.virt_addr); + src_image.virt_addr = (unsigned char *)rknn_app_ctx.img_dma_buf.dma_buf_virt_addr; + src_image.fd = rknn_app_ctx.img_dma_buf.dma_buf_fd; + rknn_app_ctx.img_dma_buf.size = src_image.size; +#endif + if (ret != 0) { printf("read image fail! 
ret=%d image_path=%s\n", ret, image_path); @@ -114,7 +131,12 @@ int main(int argc, char **argv) if (src_image.virt_addr != NULL) { +#if defined(RV1106_1103) + dma_buf_free(rknn_app_ctx.img_dma_buf.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + rknn_app_ctx.img_dma_buf.dma_buf_virt_addr); +#else free(src_image.virt_addr); +#endif } return 0; diff --git a/examples/yolox/cpp/postprocess.cc b/examples/yolox/cpp/postprocess.cc index cfef531..806d6e5 100644 --- a/examples/yolox/cpp/postprocess.cc +++ b/examples/yolox/cpp/postprocess.cc @@ -194,8 +194,74 @@ static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale) return res; } +static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } +static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } + +static int process_u8(uint8_t *input, int grid_h, int grid_w, int height, int width, int stride, + std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold, + int32_t zp, float scale) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + uint8_t thres_u8 = qnt_f32_to_affine_u8(threshold, zp, scale); + + for (int i = 0; i < grid_h; ++i) + { + for (int j = 0; j < grid_w; ++j) + { + uint8_t box_confidence = input[4 * grid_len + i * grid_w + j]; + if (box_confidence >= thres_u8) + { + int offset = i * grid_w + j; + uint8_t *in_ptr = input + offset; + + uint8_t maxClassProbs = in_ptr[5 * grid_len]; + int maxClassId = 0; + for (int k = 1; k < OBJ_CLASS_NUM; ++k) + { + uint8_t prob = in_ptr[(5 + k) * grid_len]; + if (prob > maxClassProbs) + { + maxClassId = k; + maxClassProbs = prob; + } + } + + if (maxClassProbs > thres_u8) + { + float box_x = (deqnt_affine_u8_to_f32(*in_ptr, zp, scale)); + float box_y = (deqnt_affine_u8_to_f32(in_ptr[grid_len], zp, scale)); + float box_w = (deqnt_affine_u8_to_f32(in_ptr[2 * grid_len], zp, scale)); + float box_h = (deqnt_affine_u8_to_f32(in_ptr[3 * grid_len], zp, scale)); + box_x = (box_x + j) * (float)stride; + box_y = (box_y + i) * (float)stride; + box_w = exp(box_w) * stride; + box_h = exp(box_h) * stride; + box_x -= (box_w / 2.0); + box_y -= (box_h / 2.0); + + objProbs.push_back((deqnt_affine_u8_to_f32(maxClassProbs, zp, scale)) * (deqnt_affine_u8_to_f32(box_confidence, zp, scale))); + classId.push_back(maxClassId); + validCount++; + boxes.push_back(box_x); + boxes.push_back(box_y); + boxes.push_back(box_w); + boxes.push_back(box_h); + } + } + } + } + return validCount; +} + static int process_i8(int8_t *input, int grid_h, int grid_w, int height, int width, int stride, std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold, int32_t zp, float scale) @@ -250,6 +316,60 @@ static int process_i8(int8_t *input, int grid_h, int grid_w, int height, int wid return validCount; } +static int process_i8_rv1106(int8_t *input, int grid_h, int grid_w, int height, int width, int stride, + std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold, + int32_t zp, float scale) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + int8_t thres_i8 = qnt_f32_to_affine(threshold, zp, scale); + + for (int i = 0; i < grid_h; ++i) { + for (int j = 0; j < grid_w; ++j) { + int8_t box_confidence = input[4 + (i * grid_w + j) * PROP_BOX_SIZE]; + if (box_confidence 
>= thres_i8) { + int offset = (i * grid_w + j)*PROP_BOX_SIZE; + int8_t *in_ptr = input + offset; + + int8_t maxClassProbs = input[5 + (i * grid_w + j) * PROP_BOX_SIZE]; + int maxClassId = 0; + for (int k = 1; k < OBJ_CLASS_NUM; ++k) + { + int8_t prob = input[(5 + k) + (i * grid_w + j) * PROP_BOX_SIZE]; + if (prob > maxClassProbs) + { + maxClassId = k; + maxClassProbs = prob; + } + } + + if (maxClassProbs > thres_i8) + { + float box_x = (deqnt_affine_to_f32(*in_ptr, zp, scale)); + float box_y = (deqnt_affine_to_f32(in_ptr[1], zp, scale)); + float box_w = (deqnt_affine_to_f32(in_ptr[2], zp, scale)); + float box_h = (deqnt_affine_to_f32(in_ptr[3], zp, scale)); + box_x = (box_x + j) * (float)stride; + box_y = (box_y + i) * (float)stride; + box_w = exp(box_w) * stride; + box_h = exp(box_h) * stride; + box_x -= (box_w / 2.0); + box_y -= (box_h / 2.0); + + objProbs.push_back((deqnt_affine_to_f32(maxClassProbs, zp, scale)) * (deqnt_affine_to_f32(box_confidence, zp, scale))); + classId.push_back(maxClassId); + validCount++; + boxes.push_back(box_x); + boxes.push_back(box_y); + boxes.push_back(box_w); + boxes.push_back(box_h); + } + } + } + } + return validCount; +} + static int process_fp32(float *input, int grid_h, int grid_w, int height, int width, int stride, std::vector &boxes, std::vector &objProbs, std::vector &classId, float threshold) { @@ -303,8 +423,13 @@ static int process_fp32(float *input, int grid_h, int grid_w, int height, int wi return validCount; } -int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) { +#if defined(RV1106_1103) + rknn_tensor_mem **_outputs = (rknn_tensor_mem **)outputs; +#else + rknn_output *_outputs = (rknn_output *)outputs; +#endif std::vector filterBoxes; std::vector objProbs; std::vector classId; @@ -319,20 +444,46 @@ int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t for (int i = 0; i < 3; i++) { +#if defined(RV1106_1103) + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[2]; + stride = model_in_h / grid_h; + //RV1106 only support i8 + if (app_ctx->is_quant) { + validCount += process_i8_rv1106((int8_t *)_outputs[i]->virt_addr, grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); + } +#elif defined(RKNPU1) + grid_h = app_ctx->output_attrs[i].dims[1]; + grid_w = app_ctx->output_attrs[i].dims[0]; + stride = model_in_h / grid_h; + + if (app_ctx->is_quant) + { + validCount += process_u8((uint8_t *)_outputs[i].buf, grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); + } + else + { + validCount += process_fp32((float *)_outputs[i].buf, grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + classId, conf_threshold); + } +#else grid_h = app_ctx->output_attrs[i].dims[2]; grid_w = app_ctx->output_attrs[i].dims[3]; stride = model_in_h / grid_h; if (app_ctx->is_quant) { - validCount += process_i8((int8_t *)outputs[i].buf, grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + validCount += process_i8((int8_t *)_outputs[i].buf, grid_h, grid_w, model_in_h, model_in_w, stride, 
filterBoxes, objProbs, classId, conf_threshold, app_ctx->output_attrs[i].zp, app_ctx->output_attrs[i].scale); } else { - validCount += process_fp32((float *)outputs[i].buf, grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, + validCount += process_fp32((float *)_outputs[i].buf, grid_h, grid_w, model_in_h, model_in_w, stride, filterBoxes, objProbs, classId, conf_threshold); } +#endif } // no object detect diff --git a/examples/yolox/cpp/postprocess.h b/examples/yolox/cpp/postprocess.h index dc77270..16594d0 100644 --- a/examples/yolox/cpp/postprocess.h +++ b/examples/yolox/cpp/postprocess.h @@ -12,6 +12,7 @@ #define OBJ_CLASS_NUM 80 #define NMS_THRESH 0.45 #define BOX_THRESH 0.25 +#define PROP_BOX_SIZE (5 + OBJ_CLASS_NUM) // class rknn_app_context_t; @@ -30,7 +31,7 @@ typedef struct { int init_post_process(); void deinit_post_process(); char *coco_cls_to_name(int cls_id); -int post_process(rknn_app_context_t *app_ctx, rknn_output *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); void deinitPostProcess(); #endif //_RKNN_YOLOX_DEMO_POSTPROCESS_H_ diff --git a/examples/yolox/cpp/rknpu1/yolox.cc b/examples/yolox/cpp/rknpu1/yolox.cc new file mode 100644 index 0000000..a5dca9b --- /dev/null +++ b/examples/yolox/cpp/rknpu1/yolox.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolox.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" +#include "easy_timer.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolox_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
+        return -1;
+    }
+    printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output);
+
+    // Get Model Input Info
+    printf("input tensors:\n");
+    rknn_tensor_attr input_attrs[io_num.n_input];
+    memset(input_attrs, 0, sizeof(input_attrs));
+    for (int i = 0; i < io_num.n_input; i++)
+    {
+        input_attrs[i].index = i;
+        ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr));
+        if (ret != RKNN_SUCC)
+        {
+            printf("rknn_query fail! ret=%d\n", ret);
+            return -1;
+        }
+        dump_tensor_attr(&(input_attrs[i]));
+    }
+
+    // Get Model Output Info
+    printf("output tensors:\n");
+    rknn_tensor_attr output_attrs[io_num.n_output];
+    memset(output_attrs, 0, sizeof(output_attrs));
+    for (int i = 0; i < io_num.n_output; i++)
+    {
+        output_attrs[i].index = i;
+        ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr));
+        if (ret != RKNN_SUCC)
+        {
+            printf("rknn_query fail! ret=%d\n", ret);
+            return -1;
+        }
+        dump_tensor_attr(&(output_attrs[i]));
+    }
+
+    // Set to context
+    app_ctx->rknn_ctx = ctx;
+
+    // TODO
+    if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16)
+    {
+        app_ctx->is_quant = true;
+    }
+    else
+    {
+        app_ctx->is_quant = false;
+    }
+
+    app_ctx->io_num = io_num;
+    app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr));
+    memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr));
+    app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr));
+    memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr));
+
+    if (input_attrs[0].fmt == RKNN_TENSOR_NCHW)
+    {
+        printf("model is NCHW input fmt\n");
+        app_ctx->model_channel = input_attrs[0].dims[2];
+        app_ctx->model_height = input_attrs[0].dims[1];
+        app_ctx->model_width = input_attrs[0].dims[0];
+    }
+    else
+    {
+        printf("model is NHWC input fmt\n");
+        app_ctx->model_height = input_attrs[0].dims[2];
+        app_ctx->model_width = input_attrs[0].dims[1];
+        app_ctx->model_channel = input_attrs[0].dims[0];
+    }
+    printf("model input height=%d, width=%d, channel=%d\n",
+           app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel);
+
+    return 0;
+}
+
+int release_yolox_model(rknn_app_context_t *app_ctx)
+{
+    if (app_ctx->input_attrs != NULL)
+    {
+        free(app_ctx->input_attrs);
+        app_ctx->input_attrs = NULL;
+    }
+    if (app_ctx->output_attrs != NULL)
+    {
+        free(app_ctx->output_attrs);
+        app_ctx->output_attrs = NULL;
+    }
+    if (app_ctx->rknn_ctx != 0)
+    {
+        rknn_destroy(app_ctx->rknn_ctx);
+        app_ctx->rknn_ctx = 0;
+    }
+    return 0;
+}
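One detail worth flagging in the RKNPU1 port above: this runtime reports tensor dimensions innermost-first, so for an NCHW input dims[0] is W, dims[1] is H and dims[2] is C (the reverse of the RKNPU2 ordering used elsewhere in this diff, and the reason the RKNPU1 branch of post_process reads grid_w from dims[0]). A small helper that makes the convention explicit; this is hypothetical illustration, not code from the demo:

#include <stdint.h>

typedef struct { int n, c, h, w; } nchw_shape_t;

/* Normalize an RKNPU1 attribute's innermost-first dims into explicit
 * N/C/H/W fields. Hypothetical helper, for illustration only. */
static nchw_shape_t nchw_from_rknpu1_dims(const uint32_t dims[4])
{
    nchw_shape_t s;
    s.w = dims[0]; /* innermost axis comes first on RKNPU1 */
    s.h = dims[1];
    s.c = dims[2];
    s.n = dims[3];
    return s;
}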
+
+int inference_yolox_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results)
+{
+    int ret;
+    image_buffer_t dst_img;
+    letterbox_t letter_box;
+    rknn_input inputs[app_ctx->io_num.n_input];
+    rknn_output outputs[app_ctx->io_num.n_output];
+    const float nms_threshold = NMS_THRESH;      // Default NMS threshold
+    const float box_conf_threshold = BOX_THRESH; // Default box threshold
+    int bg_color = 114;
+    TIMER timer;
+    timer.indent_set("");
+
+    if ((!app_ctx) || !(img) || (!od_results))
+    {
+        return -1;
+    }
+
+    memset(od_results, 0x00, sizeof(*od_results));
+    memset(&letter_box, 0, sizeof(letterbox_t));
+    memset(&dst_img, 0, sizeof(image_buffer_t));
+    memset(inputs, 0, sizeof(inputs));
+    memset(outputs, 0, sizeof(outputs));
+
+    // Pre Process
+    dst_img.width = app_ctx->model_width;
+    dst_img.height = app_ctx->model_height;
+    dst_img.format = IMAGE_FORMAT_RGB888;
+    dst_img.size = get_image_size(&dst_img);
+    dst_img.virt_addr = (unsigned char *)malloc(dst_img.size);
+    if (dst_img.virt_addr == NULL)
+    {
+        printf("malloc buffer size:%d fail!\n", dst_img.size);
+        return -1;
+    }
+
+    // letterbox
+    timer.tik();
+    ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color);
+    if (ret < 0)
+    {
+        printf("convert_image_with_letterbox fail! ret=%d\n", ret);
+        return -1;
+    }
+    timer.tok();
+    timer.print_time("convert_image_with_letterbox");
+
+    // Set Input Data
+    inputs[0].index = 0;
+    inputs[0].type = RKNN_TENSOR_UINT8;
+    inputs[0].fmt = RKNN_TENSOR_NHWC;
+    inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel;
+    inputs[0].buf = dst_img.virt_addr;
+
+    timer.tik();
+    ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs);
+    if (ret < 0)
+    {
+        printf("rknn_inputs_set fail! ret=%d\n", ret);
+        return -1;
+    }
+    timer.tok();
+    timer.print_time("rknn_inputs_set");
+
+    // Run
+    timer.tik();
+    ret = rknn_run(app_ctx->rknn_ctx, nullptr);
+    if (ret < 0)
+    {
+        printf("rknn_run fail! ret=%d\n", ret);
+        return -1;
+    }
+    timer.tok();
+    timer.print_time("rknn_run");
+
+    // Get Output
+    memset(outputs, 0, sizeof(outputs));
+    for (int i = 0; i < app_ctx->io_num.n_output; i++)
+    {
+        outputs[i].index = i;
+        outputs[i].want_float = (!app_ctx->is_quant);
+    }
+
+    timer.tik();
+    ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL);
+    if (ret < 0)
+    {
+        printf("rknn_outputs_get fail! ret=%d\n", ret);
+        goto out;
+    }
+    timer.tok();
+    timer.print_time("rknn_outputs_get");
+
+    // Post Process
+    timer.tik();
+    post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results);
+    timer.tok();
+    timer.print_time("post_process");
+
+    // Remember to release rknn output
+    rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs);
+
+out:
+    if (dst_img.virt_addr != NULL)
+    {
+        free(dst_img.virt_addr);
+    }
+
+    return ret;
+}
\ No newline at end of file
diff --git a/examples/yolox/cpp/rknpu2/yolox.cc b/examples/yolox/cpp/rknpu2/yolox.cc
index 0b6de55..90aa733 100644
--- a/examples/yolox/cpp/rknpu2/yolox.cc
+++ b/examples/yolox/cpp/rknpu2/yolox.cc
@@ -138,11 +138,6 @@ int init_yolox_model(const char *model_path, rknn_app_context_t *app_ctx)
 int release_yolox_model(rknn_app_context_t *app_ctx)
 {
-    if (app_ctx->rknn_ctx != 0)
-    {
-        rknn_destroy(app_ctx->rknn_ctx);
-        app_ctx->rknn_ctx = 0;
-    }
     if (app_ctx->input_attrs != NULL)
     {
         free(app_ctx->input_attrs);
@@ -153,6 +148,11 @@ int release_yolox_model(rknn_app_context_t *app_ctx)
         free(app_ctx->output_attrs);
         app_ctx->output_attrs = NULL;
     }
+    if (app_ctx->rknn_ctx != 0)
+    {
+        rknn_destroy(app_ctx->rknn_ctx);
+        app_ctx->rknn_ctx = 0;
+    }
     return 0;
 }
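The reordering in release_yolox_model above is worth a comment: the attribute arrays are plain malloc'd memory and can be freed at any point, but rknn_destroy now runs last, so the context stays valid until everything derived from it has been released (the zero-copy variant below, for instance, must pass a live context to rknn_destroy_mem). A sketch of the resulting teardown pattern, assuming a context struct shaped like the demo's rknn_app_context_t:

#include <stdlib.h>
#include "yolox.h" /* assumed: defines rknn_app_context_t as in this diff */

/* Teardown sketch: release per-model bookkeeping first, destroy the
 * RKNN context last, mirroring the order the hunk above establishes. */
static void release_ctx_sketch(rknn_app_context_t *app_ctx)
{
    free(app_ctx->input_attrs);  /* plain heap memory, safe to free first */
    app_ctx->input_attrs = NULL;
    free(app_ctx->output_attrs);
    app_ctx->output_attrs = NULL;
    if (app_ctx->rknn_ctx != 0)
    {
        rknn_destroy(app_ctx->rknn_ctx); /* the context goes away last */
        app_ctx->rknn_ctx = 0;
    }
}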
diff --git a/examples/yolox/cpp/rknpu2/yolox_rv1106_1103.cc b/examples/yolox/cpp/rknpu2/yolox_rv1106_1103.cc
new file mode 100644
index 0000000..0fbd11c
--- /dev/null
+++ b/examples/yolox/cpp/rknpu2/yolox_rv1106_1103.cc
@@ -0,0 +1,235 @@
+// Copyright (c) 2023 by Rockchip Electronics Co., Ltd. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
+
+#include "yolox.h"
+#include "common.h"
+#include "file_utils.h"
+#include "image_utils.h"
+#include "easy_timer.h"
+
+static void dump_tensor_attr(rknn_tensor_attr *attr)
+{
+    printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, "
+           "zp=%d, scale=%f\n",
+           attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3],
+           attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type),
+           get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale);
+}
+
+int init_yolox_model(const char *model_path, rknn_app_context_t *app_ctx)
+{
+    int ret;
+    int model_len = 0;
+    char *model;
+    rknn_context ctx = 0;
+
+    ret = rknn_init(&ctx, (char *)model_path, 0, 0, NULL);
+    if (ret < 0)
+    {
+        printf("rknn_init fail! ret=%d\n", ret);
+        return -1;
+    }
+
+    // Get Model Input Output Number
+    rknn_input_output_num io_num;
+    ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
+    if (ret != RKNN_SUCC)
+    {
+        printf("rknn_query fail! ret=%d\n", ret);
+        return -1;
+    }
+    printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output);
+
+    // Get Model Input Info
+    printf("input tensors:\n");
+    rknn_tensor_attr input_attrs[io_num.n_input];
+    memset(input_attrs, 0, sizeof(input_attrs));
+    for (int i = 0; i < io_num.n_input; i++)
+    {
+        input_attrs[i].index = i;
+        ret = rknn_query(ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr));
+        if (ret != RKNN_SUCC)
+        {
+            printf("rknn_query fail! ret=%d\n", ret);
+            return -1;
+        }
+        dump_tensor_attr(&(input_attrs[i]));
+    }
+
+    // Get Model Output Info
+    printf("output tensors:\n");
+    rknn_tensor_attr output_attrs[io_num.n_output];
+    memset(output_attrs, 0, sizeof(output_attrs));
+    for (int i = 0; i < io_num.n_output; i++)
+    {
+        output_attrs[i].index = i;
+        // When using the zero-copy API interface, query the native output tensor attribute
+        ret = rknn_query(ctx, RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr));
+        if (ret != RKNN_SUCC)
+        {
+            printf("rknn_query fail! ret=%d\n", ret);
+            return -1;
+        }
+        dump_tensor_attr(&(output_attrs[i]));
+    }
+
+    // The default input type is int8 (normalization and quantization must then be computed outside);
+    // if set to uint8, normalization and quantization are fused into the NPU.
+    input_attrs[0].type = RKNN_TENSOR_UINT8;
+    // The default fmt is NHWC; the RV1106 NPU only supports NHWC in zero-copy mode.
+    input_attrs[0].fmt = RKNN_TENSOR_NHWC;
+    printf("input_attrs[0].size_with_stride=%d\n", input_attrs[0].size_with_stride);
+    app_ctx->input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride);
+
+    // Set input tensor memory
+    ret = rknn_set_io_mem(ctx, app_ctx->input_mems[0], &input_attrs[0]);
+    if (ret < 0) {
+        printf("input_mems rknn_set_io_mem fail! ret=%d\n", ret);
+        return -1;
+    }
+
+    // Set output tensor memory
+    for (uint32_t i = 0; i < io_num.n_output; ++i) {
+        app_ctx->output_mems[i] = rknn_create_mem(ctx, output_attrs[i].size_with_stride);
+        ret = rknn_set_io_mem(ctx, app_ctx->output_mems[i], &output_attrs[i]);
+        if (ret < 0) {
+            printf("output_mems rknn_set_io_mem fail! ret=%d\n", ret);
+            return -1;
+        }
+    }
+
+    // Set to context
+    app_ctx->rknn_ctx = ctx;
+
+    // TODO
+    if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC)
+    {
+        app_ctx->is_quant = true;
+    }
+    else
+    {
+        app_ctx->is_quant = false;
+    }
+
+    app_ctx->io_num = io_num;
+    app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr));
+    memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr));
+    app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr));
+    memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr));
+
+    if (input_attrs[0].fmt == RKNN_TENSOR_NCHW)
+    {
+        printf("model is NCHW input fmt\n");
+        app_ctx->model_channel = input_attrs[0].dims[1];
+        app_ctx->model_height = input_attrs[0].dims[2];
+        app_ctx->model_width = input_attrs[0].dims[3];
+    } else
+    {
+        printf("model is NHWC input fmt\n");
+        app_ctx->model_height = input_attrs[0].dims[1];
+        app_ctx->model_width = input_attrs[0].dims[2];
+        app_ctx->model_channel = input_attrs[0].dims[3];
+    }
+
+    printf("model input height=%d, width=%d, channel=%d\n",
+           app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel);
+
+    return 0;
+}
+
+int release_yolox_model(rknn_app_context_t *app_ctx)
+{
+    if (app_ctx->input_attrs != NULL)
+    {
+        free(app_ctx->input_attrs);
+        app_ctx->input_attrs = NULL;
+    }
+    if (app_ctx->output_attrs != NULL)
+    {
+        free(app_ctx->output_attrs);
+        app_ctx->output_attrs = NULL;
+    }
+    for (int i = 0; i < app_ctx->io_num.n_input; i++) {
+        if (app_ctx->input_mems[i] != NULL) {
+            rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]);
+        }
+    }
+    for (int i = 0; i < app_ctx->io_num.n_output; i++) {
+        if (app_ctx->output_mems[i] != NULL) {
+            rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]);
+        }
+    }
+    if (app_ctx->rknn_ctx != 0)
+    {
+        rknn_destroy(app_ctx->rknn_ctx);
+        app_ctx->rknn_ctx = 0;
+    }
+    return 0;
+}
+
+int inference_yolox_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results)
+{
+    int ret;
+    image_buffer_t dst_img;
+    letterbox_t letter_box;
+    const float nms_threshold = NMS_THRESH;      // default NMS threshold
+    const float box_conf_threshold = BOX_THRESH; // default confidence threshold
+    int bg_color = 114;
+
+    if ((!app_ctx) || !(img) || (!od_results))
+    {
+        return -1;
+    }
+    memset(od_results, 0x00, sizeof(*od_results));
+    memset(&letter_box, 0, sizeof(letterbox_t));
+    memset(&dst_img, 0, sizeof(image_buffer_t));
+
+    // Pre Process
+    dst_img.width = app_ctx->model_width;
+    dst_img.height = app_ctx->model_height;
+    dst_img.format = IMAGE_FORMAT_RGB888;
+    dst_img.size = get_image_size(&dst_img);
+    dst_img.fd = app_ctx->input_mems[0]->fd;
+    if (dst_img.virt_addr == NULL && dst_img.fd == 0)
+    {
+        printf("malloc buffer size:%d fail!\n", dst_img.size);
+        return -1;
+    }
+
+    // letterbox
+    ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color);
+    if (ret < 0)
+    {
+        printf("convert_image_with_letterbox fail! ret=%d\n", ret);
+        return -1;
+    }
+
+    // Run
+    printf("rknn_run\n");
+    ret = rknn_run(app_ctx->rknn_ctx, nullptr);
+    if (ret < 0) {
+        printf("rknn_run fail! ret=%d\n", ret);
+        return -1;
+    }
+
+    // Post Process
+    post_process(app_ctx, app_ctx->output_mems, &letter_box, box_conf_threshold, nms_threshold, od_results);
+out:
+    return ret;
+}
\ No newline at end of file
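Note that the RV1106/RV1103 flow above never calls rknn_inputs_set or rknn_outputs_get: input and output buffers are bound once with rknn_set_io_mem at init time, the letterboxed frame is written directly into the DMA buffer behind input_mems[0] (via its fd), and post_process reads results straight out of output_mems. Condensed to its essentials, the per-frame loop looks roughly like this; it is a sketch, with a plain memcpy standing in for the letterbox-into-DMA-buffer step the demo actually performs:

#include <string.h>
#include <stdint.h>
#include "rknn_api.h"

/* Per-frame zero-copy inference (sketch). Buffers were already bound
 * with rknn_set_io_mem() during init, so each frame only needs to:
 *   1) write pixels into the pre-bound input buffer,
 *   2) call rknn_run(),
 *   3) read results from the pre-bound output buffers. */
static int run_frame_sketch(rknn_context ctx, rknn_tensor_mem *input_mem,
                            rknn_tensor_mem *output_mem, const uint8_t *rgb, size_t size)
{
    memcpy(input_mem->virt_addr, rgb, size);  /* 1) the demo letterboxes via the buffer's fd instead */
    int ret = rknn_run(ctx, NULL);            /* 2) */
    if (ret < 0)
        return ret;
    int8_t *out = (int8_t *)output_mem->virt_addr; /* 3) hand to post-processing */
    (void)out;
    return 0;
}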
diff --git a/examples/yolox/cpp/yolox.h b/examples/yolox/cpp/yolox.h
index 579b003..c7ee3d4 100644
--- a/examples/yolox/cpp/yolox.h
+++ b/examples/yolox/cpp/yolox.h
@@ -19,11 +19,24 @@
 #include "rknn_api.h"
 #include "common.h"
 
+#if defined(RV1106_1103)
+    typedef struct {
+        char *dma_buf_virt_addr;
+        int dma_buf_fd;
+        int size;
+    }rknn_dma_buf;
+#endif
+
 typedef struct {
     rknn_context rknn_ctx;
     rknn_input_output_num io_num;
     rknn_tensor_attr* input_attrs;
     rknn_tensor_attr* output_attrs;
+#if defined(RV1106_1103)
+    rknn_tensor_mem* input_mems[1];
+    rknn_tensor_mem* output_mems[3];
+    rknn_dma_buf img_dma_buf;
+#endif
     int model_channel;
     int model_width;
     int model_height;
diff --git a/examples/yolox/model_comparison/yolox_graph_comparison.jpg b/examples/yolox/model_comparison/yolox_graph_comparison.jpg
new file mode 100644
index 0000000..6070c07
Binary files /dev/null and b/examples/yolox/model_comparison/yolox_graph_comparison.jpg differ
diff --git a/examples/yolox/model_comparison/yolox_output_comparison.jpg b/examples/yolox/model_comparison/yolox_output_comparison.jpg
new file mode 100644
index 0000000..d3615e4
Binary files /dev/null and b/examples/yolox/model_comparison/yolox_output_comparison.jpg differ
diff --git a/examples/yolox/python/convert.py b/examples/yolox/python/convert.py
index 9580c3e..1e7b5f0 100644
--- a/examples/yolox/python/convert.py
+++ b/examples/yolox/python/convert.py
@@ -1,6 +1,4 @@
-import os
 import sys
-import numpy as np
 from rknn.api import RKNN
 
 DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt'
@@ -10,8 +8,9 @@ def parse_arg():
     if len(sys.argv) < 3:
         print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0]));
-        print("       platform choose from [rk3562,rk3566,rk3568,rk3588]")
-        print("       dtype choose from [i8, fp]")
+        print("       platform choose from [rk3562, rk3566, rk3568, rk3588, rk1808, rv1109, rv1126]")
+        print("       dtype choose from [i8, fp] for [rk3562, rk3566, rk3568, rk3588]")
+        print("       dtype choose from [u8, fp] for [rk1808, rv1109, rv1126]")
         exit(1)
 
     model_path = sys.argv[1]
@@ -20,10 +19,10 @@ def parse_arg():
     do_quant = DEFAULT_QUANT
     if len(sys.argv) > 3:
         model_type = sys.argv[3]
-        if model_type not in ['i8', 'fp']:
+        if model_type not in ['i8', 'u8', 'fp']:
             print("ERROR: Invalid model type: {}".format(model_type))
             exit(1)
-        elif model_type == 'i8':
+        elif model_type in ['i8', 'u8']:
             do_quant = True
         else:
             do_quant = False
diff --git a/scaling_frequency.sh b/scaling_frequency.sh
index 15420ee..83d51d5 100644
--- a/scaling_frequency.sh
+++ b/scaling_frequency.sh
@@ -4,7 +4,7 @@
 freq_set_status=0
 usage()
 {
-    echo "USAGE: ./fixed_frequency.sh -c {chip_name} [-h]"
+    echo "USAGE: ./scaling_frequency.sh -c {chip_name} [-h]"
     echo " -c: chip_name, such as rv1126 / rk3588"
     echo " -h: Help"
 }
@@ -99,6 +99,11 @@ elif [ $chip_name == 'rk3588' ]; then
     CPU_freq=2256000
     NPU_freq=1000000000
     DDR_freq=2112000000
+elif [ $chip_name == 'rk3576' ]; then
+    seting_strategy=7
+    CPU_freq=2016000
+    NPU_freq=1000000000
+    DDR_freq=2112000000
 elif [ $chip_name == 'rv1106' ]; then
     seting_strategy=5
     CPU_freq=1608000
@@ -190,13 +195,13 @@ case $seting_strategy in
         echo " Core4"
         echo userspace > /sys/devices/system/cpu/cpufreq/policy4/scaling_governor
         echo $CPU_freq > /sys/devices/system/cpu/cpufreq/policy4/scaling_setspeed
-        NPU_cur_freq=$(cat /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq)
+        CPU_cur_freq=$(cat /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq)
         print_and_compare_result $CPU_freq $CPU_cur_freq
 
         echo " Core6"
         echo userspace > /sys/devices/system/cpu/cpufreq/policy6/scaling_governor
         echo $CPU_freq > /sys/devices/system/cpu/cpufreq/policy6/scaling_setspeed
-        DDR_cur_freq=$(cat /sys/devices/system/cpu/cpu6/cpufreq/cpuinfo_cur_freq)
+        CPU_cur_freq=$(cat /sys/devices/system/cpu/cpu6/cpufreq/cpuinfo_cur_freq)
         print_and_compare_result $CPU_freq $CPU_cur_freq
 
         echo "NPU: seting frequency"
@@ -273,6 +278,38 @@ case $seting_strategy in
         print_not_support_adjust DDR $DDR_freq $DDR_cur_freq
         ;;
 
+    # rk3576
+    7)
+        echo "CPU: setting frequency"
+        echo " Core0"
+        echo userspace > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
+        echo 1800000 > /sys/devices/system/cpu/cpufreq/policy0/scaling_setspeed
+        CPU_cur_freq=$(cat /sys/devices/system/cpu/cpufreq/policy0/scaling_cur_freq)
+        print_and_compare_result 1800000 $CPU_cur_freq
+
+        echo " Core4"
+        echo userspace > /sys/devices/system/cpu/cpufreq/policy4/scaling_governor
+        echo $CPU_freq > /sys/devices/system/cpu/cpufreq/policy4/scaling_setspeed
+        CPU_cur_freq=$(cat /sys/devices/system/cpu/cpufreq/policy4/scaling_cur_freq)
+        print_and_compare_result $CPU_freq $CPU_cur_freq
+
+        echo "NPU: setting frequency"
+        echo userspace > /sys/devices/platform/27700000.npu/devfreq/27700000.npu/governor
+        echo $NPU_freq > /sys/class/devfreq/27700000.npu/userspace/set_freq
+        NPU_cur_freq=$(cat /sys/class/devfreq/27700000.npu/cur_freq)
+        print_and_compare_result $NPU_freq $NPU_cur_freq
+
+        echo "DDR: setting frequency"
+        if [ -e /sys/class/devfreq/dmc/governor ];then
+            echo userspace > /sys/class/devfreq/dmc/governor
+            echo $DDR_freq > /sys/class/devfreq/dmc/userspace/set_freq
+            DDR_cur_freq=$(cat /sys/class/devfreq/dmc/cur_freq)
+        else
+            DDR_cur_freq=$(cat /sys/kernel/debug/clk/clk_summary | grep scmi_clk_ddr | awk '{split($0,a," "); print a[5]}')
+        fi
+        print_not_support_adjust DDR $DDR_freq $DDR_cur_freq
+        ;;
+
     *)
         echo "seting strategy not implement now"
         ;;
@@ -291,4 +328,4 @@
 if [ $freq_set_status == 0 ];then
     echo "Seting Success" >> ./freq_set_status
 else
     echo "Seting Failed" >> ./freq_set_status
-fi
\ No newline at end of file
+fi
diff --git a/utils/image_utils.c b/utils/image_utils.c
index 42ac7ec..6f3f4af 100644
--- a/utils/image_utils.c
+++ b/utils/image_utils.c
@@ -434,7 +434,7 @@ static int convert_image_cpu(image_buffer_t *src, image_buffer_t *dst, image_rec
                              src_box_x, src_box_y, src_box_w, src_box_h,
                              dst->virt_addr, dst->width, dst->height,
                              dst_box_x, dst_box_y, dst_box_w, dst_box_h);
-    } else if (src->format == IMAGE_FORMAT_YUV420SP_NV12 || src->format == IMAGE_FORMAT_YUV420SP_NV12) {
+    } else if (src->format == IMAGE_FORMAT_YUV420SP_NV12 || src->format == IMAGE_FORMAT_YUV420SP_NV21) {
        ret = crop_and_scale_image_yuv420sp(src->virt_addr, src->width, src->height,
                                             src_box_x, src_box_y, src_box_w, src_box_h,
                                             dst->virt_addr, dst->width, dst->height,