diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 7f3a11d9ddab..6fb7a0f3f8b3 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -87,6 +87,7 @@ typedef enum {
   kFuncHandle = 10U,
   kStr = 11U,
   kBytes = 12U,
+  kNDArrayContainer = 13U,
   // Extension codes for other frameworks to integrate TVM PackedFunc.
   // To make sure each framework's id do not conflict, use first and
   // last sections to mark ranges.
@@ -121,6 +122,9 @@ typedef DLContext TVMContext;
  */
 typedef DLTensor TVMArray;
 
+/*! \brief the array handle */
+typedef TVMArray* TVMArrayHandle;
+
 /*!
  * \brief Union type of values
  *  being passed through API and function calls.
@@ -149,8 +153,6 @@ typedef void* TVMModuleHandle;
 typedef void* TVMFunctionHandle;
 /*! \brief Handle to hold return value. */
 typedef void* TVMRetValueHandle;
-/*! \brief the array handle */
-typedef TVMArray* TVMArrayHandle;
 /*!
  * \brief The stream that is specific to device
  * can be NULL, which indicates the default one.
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
new file mode 100644
index 000000000000..dfb06255381a
--- /dev/null
+++ b/include/tvm/runtime/ndarray.h
@@ -0,0 +1,286 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file tvm/runtime/ndarray.h
+ * \brief Reference counted NDArray container that is DLTensor compatible
+ */
+#ifndef TVM_RUNTIME_NDARRAY_H_
+#define TVM_RUNTIME_NDARRAY_H_
+
+#include <atomic>
+#include <vector>
+#include <utility>
+#include "./c_runtime_api.h"
+
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief Managed NDArray.
+ *  The array is backed by reference counted blocks.
+ */
+class NDArray {
+ public:
+  // internal container type
+  struct Container;
+  /*! \brief default constructor */
+  NDArray() {}
+  /*!
+   * \brief construct a NDArray that refers to data
+   * \param data The data this NDArray refers to
+   */
+  explicit inline NDArray(Container* data);
+  /*!
+   * \brief copy constructor
+   * \param other The value to be copied
+   */
+  inline NDArray(const NDArray& other);  // NOLINT(*)
+  /*!
+   * \brief move constructor
+   * \param other The value to be moved
+   */
+  NDArray(NDArray&& other) noexcept // NOLINT(*)
+      : data_(other.data_) {
+    other.data_ = nullptr;
+  }
+  /*! \brief destructor */
+  ~NDArray() {
+    this->reset();
+  }
+  /*!
+   * \brief Swap this array with another NDArray
+   * \param other The other NDArray
+   */
+  void swap(NDArray& other) noexcept {  // NOLINT(*)
+    std::swap(data_, other.data_);
+  }
+  /*!
+   * \brief copy assignment
+   * \param other The value to be assigned.
+   * \return reference to self.
+   */
+  NDArray& operator=(const NDArray& other) {  // NOLINT(*)
+    // copy-and-swap idiom
+    NDArray(other).swap(*this);  // NOLINT(*)
+    return *this;
+  }
+  /*!
+   * \brief move assignment
+   * \param other The value to be assigned.
+   * \return reference to self.
+   */
+  NDArray& operator=(NDArray&& other) noexcept {  // NOLINT(*)
+    // move-and-swap: the temporary releases our previous content
+    NDArray(std::move(other)).swap(*this); // NOLINT(*)
+    return *this;
+  }
+  /*! \return If NDArray is defined */
+  bool defined() const {
+    return data_ != nullptr;
+  }
+  /*! \return If both NDArray reference the same container */
+  bool same_as(const NDArray& other) const {
+    return data_ == other.data_;
+  }
+  /*! \brief reset the content of NDArray to be nullptr */
+  inline void reset();
+  /*!
+   * \return the reference counter
+   * \note this number is approximate in multi-threaded setting.
+   */
+  inline int use_count() const;
+  /*! \return Pointer to content of DLTensor */
+  inline const DLTensor* operator->() const;
+  /*!
+   * \brief Copy data content from another array.
+   * \param other The source array to be copied from.
+   * \note The copy may happen asynchronously if it involves a GPU context.
+   *       TVMSynchronize is necessary.
+   */
+  inline void CopyFrom(DLTensor* other);
+  inline void CopyFrom(const NDArray& other);
+  /*!
+   * \brief Copy data content into another array.
+   * \param other The target array to be copied to.
+   * \note The copy may happen asynchronously if it involves a GPU context.
+   *       TVMSynchronize is necessary.
+   */
+  inline void CopyTo(DLTensor* other);
+  inline void CopyTo(const NDArray& other);
+  /*!
+   * \brief Create a NDArray that shares the data memory with the current one.
+   * \param shape The shape of the new array.
+   * \param dtype The data type of the new array.
+   * \note The memory size of new array must not be bigger than the current one.
+   */
+  TVM_DLL NDArray CreateView(
+      std::vector<int64_t> shape, DLDataType dtype);
+  /*!
+   * \brief Create a reference view of NDArray that
+   *  represents as DLManagedTensor.
+   * \return A DLManagedTensor
+   */
+  TVM_DLL DLManagedTensor* ToDLPack() const;
+  /*!
+   * \brief Create an empty NDArray.
+   * \param shape The shape of the new array.
+   * \param dtype The data type of the new array.
+   * \param ctx The context of the Array.
+   * \return The created Array
+   */
+  TVM_DLL static NDArray Empty(std::vector<int64_t> shape,
+                               DLDataType dtype,
+                               DLContext ctx);
+  /*!
+   * \brief Create a NDArray backed by a dlpack tensor.
+   *
+   * This allows us to create a NDArray using the memory
+   * allocated by an external deep learning framework
+   * that is DLPack compatible.
+   *
+   * The memory is retained until the NDArray goes out of scope.
+   *
+   * \return The created NDArray view.
+   */
+  TVM_DLL static NDArray FromDLPack(DLManagedTensor* tensor);
+  /*!
+   * \brief Function to copy data from one array to another.
+   * \param from The source array.
+   * \param to The target array.
+   * \param stream The stream used in copy.
+   */
+  TVM_DLL static void CopyFromTo(
+      DLTensor* from, DLTensor* to, TVMStreamHandle stream = nullptr);
+
+  // internal namespace
+  struct Internal;
+ private:
+  /*! \brief Internal Data content */
+  Container* data_{nullptr};
+  // enable internal functions
+  friend struct Internal;
+  friend class TVMRetValue;
+  friend class TVMArgsSetter;
+};
+
+/*!
+ * \brief Reference counted Container object used to back NDArray.
+ *
+ *  This object is DLTensor compatible:
+ *    the pointer to the NDArrayContainer can be directly
+ *    interpreted as a DLTensor*
+ *
+ * \note: do not use this struct directly, use NDArray.
+ */
+struct NDArray::Container {
+ public:
+  // NOTE: the first part of this structure is the same as
+  // DLManagedTensor, note that, however, the deleter
+  // is only called when the reference counter goes to 0
+  /*!
+   * \brief The corresponding dl_tensor field.
+   * \note it is important that the first field is DLTensor
+   *  So that this data structure is DLTensor compatible.
+   *  The head ptr of this struct can be viewed as DLTensor*.
+   */
+  DLTensor dl_tensor;
+  /*!
+   * \brief additional context, reserved for recycling
+   * \note We can attach additional content here
+   *  which the current container depends on
+   *  (e.g. reference to original memory when creating views).
+   */
+  void* manager_ctx{nullptr};
+  /*!
+   * \brief Customized deleter
+   *
+   * \note The customized deleter is helpful to enable
+   *  different ways of memory allocator that are not
+   *  currently defined by the system.
+   */
+  void (*deleter)(Container* self) = nullptr;
+  /*! \brief default constructor */
+  Container() {
+    dl_tensor.data = nullptr;
+    dl_tensor.ndim = 0;
+    dl_tensor.shape = nullptr;
+    dl_tensor.strides = nullptr;
+    dl_tensor.byte_offset = 0;
+  }
+  /*! \brief developer function, increases reference counter */
+  void IncRef() {
+    ref_counter_.fetch_add(1, std::memory_order_relaxed);
+  }
+  /*! \brief developer function, decrease reference counter */
+  void DecRef() {
+    if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) {  // this was the last reference
+      std::atomic_thread_fence(std::memory_order_acquire);
+      if (this->deleter != nullptr) {  // without a deleter nothing is freed here
+        (*this->deleter)(this);
+      }
+    }
+  }
+
+ private:
+  friend class NDArray;
+  /*!
+   * \brief The shape container,
+   *  can be used for shape data.
+   */
+  std::vector<int64_t> shape_;
+  /*! \brief The internal reference counter */
+  std::atomic<int> ref_counter_{0};
+};
+
+// implementations of inline functions
+// the usages of functions are documented in place.
+inline NDArray::NDArray(Container* data)
+  : data_(data) {
+  if (data_ != nullptr) data_->IncRef();  // a null container yields an undefined NDArray
+}
+
+inline NDArray::NDArray(const NDArray& other)
+  : data_(other.data_) {
+  if (data_ != nullptr) data_->IncRef();  // copying an undefined NDArray stays undefined
+}
+
+inline void NDArray::reset() {
+  // nothing to release when the array is already undefined
+  if (data_ == nullptr) return;
+  data_->DecRef();
+  data_ = nullptr;
+}
+
+inline void NDArray::CopyFrom(DLTensor* other) {
+  CHECK(data_ != nullptr);  // this array is the copy destination
+  NDArray::CopyFromTo(other, &data_->dl_tensor);
+}
+
+inline void NDArray::CopyFrom(const NDArray& other) {
+  CHECK(data_ != nullptr);        // destination must be defined
+  CHECK(other.data_ != nullptr);  // source must be defined
+  NDArray::CopyFromTo(&other.data_->dl_tensor, &data_->dl_tensor);
+}
+
+inline void NDArray::CopyTo(DLTensor* other) {
+  CHECK(data_ != nullptr);  // this array is the copy source
+  NDArray::CopyFromTo(&data_->dl_tensor, other);
+}
+
+inline void NDArray::CopyTo(const NDArray& other) {
+  CHECK(data_ != nullptr);        // source must be defined
+  CHECK(other.data_ != nullptr);  // destination must be defined
+  NDArray::CopyFromTo(&data_->dl_tensor, &other.data_->dl_tensor);
+}
+
+inline int NDArray::use_count() const {
+  // an undefined array has no owners; otherwise take a relaxed snapshot of the counter
+  return defined() ? data_->ref_counter_.load(std::memory_order_relaxed) : 0;
+}
+
+inline const DLTensor* NDArray::operator->() const {  // NOTE(review): no null check on data_
+  return &(data_->dl_tensor);
+}
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_NDARRAY_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 16266b4f99c9..6d8df4a5e3d6 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -16,6 +16,7 @@
 #include <type_traits>
 #include "./c_runtime_api.h"
 #include "./module.h"
+#include "./ndarray.h"
 
 namespace HalideIR {
 // Forward declare type for extensions
@@ -249,10 +250,22 @@ class TVMPODValue_ {
     TVM_CHECK_TYPE_CODE(type_code_, kHandle);
     return value_.v_handle;
   }
-  operator TVMArray*() const {
-    if (type_code_ == kNull) return nullptr;
-    TVM_CHECK_TYPE_CODE(type_code_, kArrayHandle);
-    return static_cast<TVMArray*>(value_.v_handle);
+  operator DLTensor*() const {
+    // a null argument converts to a null tensor pointer
+    if (type_code_ == kNull) return nullptr;
+    const bool is_tensor_like =
+        (type_code_ == kArrayHandle) || (type_code_ == kNDArrayContainer);
+    if (!is_tensor_like) {
+      LOG(FATAL) << "Expected " << "DLTensor* or NDArray but get "
+                 << TypeCode2Str(type_code_);
+      return nullptr;
+    }
+    return static_cast<DLTensor*>(value_.v_handle);
+  }
+  operator NDArray() const {
+    if (type_code_ == kNull) return NDArray();  // null converts to an undefined NDArray
+    TVM_CHECK_TYPE_CODE(type_code_, kNDArrayContainer);
+    return NDArray(static_cast<NDArray::Container*>(value_.v_handle));  // adds a reference
+  }
   operator TVMContext() const {
     TVM_CHECK_TYPE_CODE(type_code_, kTVMContext);
@@ -312,8 +325,10 @@ class TVMArgValue : public TVMPODValue_ {
   using TVMPODValue_::operator int;
   using TVMPODValue_::operator bool;
   using TVMPODValue_::operator void*;
-  using TVMPODValue_::operator TVMArray*;
+  using TVMPODValue_::operator DLTensor*;
+  using TVMPODValue_::operator NDArray;
   using TVMPODValue_::operator TVMContext;
+
   // conversion operator.
   operator std::string() const {
     if (type_code_ == kTVMType) {
@@ -394,8 +409,9 @@ class TVMRetValue : public TVMPODValue_ {
   using TVMPODValue_::operator int;
   using TVMPODValue_::operator bool;
   using TVMPODValue_::operator void*;
-  using TVMPODValue_::operator TVMArray*;
+  using TVMPODValue_::operator DLTensor*;
   using TVMPODValue_::operator TVMContext;
+  using TVMPODValue_::operator NDArray;
   // Disable copy and assign from another value, but allow move.
   TVMRetValue(const TVMRetValue& other) {
     this->Assign(other);
@@ -477,6 +493,13 @@ class TVMRetValue : public TVMPODValue_ {
     this->SwitchToClass(kBytes, std::string(value.data, value.size));
     return *this;
   }
+  TVMRetValue& operator=(NDArray other) {
+    this->Clear();
+    type_code_ = kNDArrayContainer;
+    value_.v_handle = other.data_;  // take over the reference held by other
+    other.data_ = nullptr;          // so that other's destructor does not DecRef it
+    return *this;
+  }
   TVMRetValue& operator=(PackedFunc f) {
     this->SwitchToClass(kFuncHandle, f);
     return *this;
@@ -559,6 +582,10 @@ class TVMRetValue : public TVMPODValue_ {
         SwitchToClass<Module>(kModuleHandle, other);
         break;
       }
+      case kNDArrayContainer: {
+        *this = other.operator NDArray();
+        break;
+      }
       case kNodeHandle: {
         SwitchToClass<std::shared_ptr<Node> >(
             kNodeHandle, *other.template ptr<std::shared_ptr<Node> >());
@@ -607,6 +634,10 @@ class TVMRetValue : public TVMPODValue_ {
       case kFuncHandle: delete ptr<PackedFunc>(); break;
       case kModuleHandle: delete ptr<Module>(); break;
       case kNodeHandle: delete ptr<std::shared_ptr<Node> >(); break;
+      case kNDArrayContainer: {
+        static_cast<NDArray::Container*>(value_.v_handle)->DecRef();
+        break;
+      }
     }
     if (type_code_ > kExtBegin) {
 #if TVM_RUNTIME_HEADER_ONLY
@@ -635,6 +666,7 @@ inline const char* TypeCode2Str(int type_code) {
     case kTVMContext: return "TVMContext";
     case kFuncHandle: return "FunctionHandle";
     case kModuleHandle: return "ModuleHandle";
+    case kNDArrayContainer: return "NDArrayContainer";
     default: LOG(FATAL) << "unknown type_code="
                         << static_cast<int>(type_code); return "";
   }
@@ -776,7 +808,7 @@ class TVMArgsSetter {
     values_[i].v_handle = value;
     type_codes_[i] = kHandle;
   }
-  void operator()(size_t i, TVMArray* value) const {
+  void operator()(size_t i, DLTensor* value) const {
     values_[i].v_handle = value;
     type_codes_[i] = kArrayHandle;
   }
@@ -811,6 +843,10 @@ class TVMArgsSetter {
     values_[i].v_handle = const_cast<Module*>(&value);
     type_codes_[i] = kModuleHandle;
   }
+  void operator()(size_t i, const NDArray& value) const {  // NOLINT(*)
+    values_[i].v_handle = value.data_;  // stores the raw container pointer; no IncRef here
+    type_codes_[i] = kNDArrayContainer;
+  }
   void operator()(size_t i, const TVMRetValue& value) const {  // NOLINT(*)
     if (value.type_code() == kStr) {
       values_[i].v_str = value.ptr<std::string>()->c_str();
diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py
index 189d9964baf5..602af3ef858b 100644
--- a/python/tvm/_ffi/_ctypes/function.py
+++ b/python/tvm/_ffi/_ctypes/function.py
@@ -94,7 +94,8 @@ def _make_tvm_args(args, temp_args):
             type_codes[i] = TypeCode.NULL
         elif isinstance(arg, NDArrayBase):
             values[i].v_handle = ctypes.cast(arg.handle, ctypes.c_void_p)
-            type_codes[i] = TypeCode.ARRAY_HANDLE
+            type_codes[i] = (TypeCode.NDARRAY_CONTAINER
+                             if not arg.is_view else TypeCode.ARRAY_HANDLE)
         elif isinstance(arg, _nd._TVM_COMPATS):
             values[i].v_handle = ctypes.c_void_p(arg._tvm_handle)
             type_codes[i] = arg.__class__._tvm_tcode
@@ -208,6 +209,7 @@ def _handle_return_func(x):
 C_TO_PY_ARG_SWITCH[TypeCode.MODULE_HANDLE] = _wrap_arg_func(
     _return_module, TypeCode.MODULE_HANDLE)
 C_TO_PY_ARG_SWITCH[TypeCode.ARRAY_HANDLE] = lambda x: _make_array(x.v_handle, True)
+C_TO_PY_ARG_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False)
 
 _CLASS_MODULE = None
 _CLASS_FUNCTION = None
diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi
index c027c723de08..50a99245f793 100644
--- a/python/tvm/_ffi/_cython/base.pxi
+++ b/python/tvm/_ffi/_cython/base.pxi
@@ -18,6 +18,7 @@ cdef enum TVMTypeCode:
     kFuncHandle = 10
     kStr = 11
     kBytes = 12
+    kNDArrayContainer = 13
     kExtBegin = 15
 
 cdef extern from "tvm/runtime/c_runtime_api.h":
diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi
index 06cda82624b9..989f5b8e7b47 100644
--- a/python/tvm/_ffi/_cython/function.pxi
+++ b/python/tvm/_ffi/_cython/function.pxi
@@ -84,7 +84,8 @@ cdef inline int make_arg(object arg,
         tcode[0] = kNodeHandle
     elif isinstance(arg, NDArrayBase):
         value[0].v_handle = (<NDArrayBase>arg).chandle
-        tcode[0] = kArrayHandle
+        tcode[0] = (kNDArrayContainer if
+                    not (<NDArrayBase>arg).c_is_view else kArrayHandle)
     elif isinstance(arg, _TVM_COMPATS):
         ptr = arg._tvm_handle
         value[0].v_handle = (<void*>ptr)
@@ -173,6 +174,8 @@ cdef inline object make_ret(TVMValue value, int tcode):
         return value.v_int64
     elif tcode == kFloat:
         return value.v_float64
+    elif tcode == kNDArrayContainer:
+        return c_make_array(value.v_handle, False)
     elif tcode == kStr:
         return py_str(value.v_str)
     elif tcode == kBytes:
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 9609f867576b..612b54649d74 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -25,6 +25,7 @@ class TypeCode(object):
     FUNC_HANDLE = 10
     STR = 11
     BYTES = 12
+    NDARRAY_CONTAINER = 13
     EXT_BEGIN = 15
 
 class TVMByteArray(ctypes.Structure):
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index 6290f63e611d..206bd95010ce 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -102,6 +102,7 @@ LoweredFunc MakeAPI(Stmt body,
         msg << name << ": Expect arg[" << i << "] to be pointer";
         seq_check.emplace_back(
             AssertStmt::make(tcode == kHandle ||
+                             tcode == kNDArrayContainer ||
                              tcode == kArrayHandle ||
                              tcode == kNull, msg.str(), nop));
       } else if (t.is_int() || t.is_uint()) {
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 5eb39abcc71a..7a7d7ab9f4db 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -124,54 +124,6 @@ void DeviceAPI::SyncStreamFromTo(TVMContext ctx,
                                  TVMStreamHandle event_dst) {
   LOG(FATAL) << "Device does not support stream api.";
 }
-
-inline TVMArray* TVMArrayCreate_() {
-  TVMArray* arr = new TVMArray();
-  arr->shape = nullptr;
-  arr->strides = nullptr;
-  arr->ndim = 0;
-  arr->data = nullptr;
-  return arr;
-}
-
-inline void TVMArrayFree_(TVMArray* arr) {
-  if (arr != nullptr) {
-    // ok to delete nullptr
-    delete[] arr->shape;
-    delete[] arr->strides;
-    if (arr->data != nullptr) {
-      DeviceAPIManager::Get(arr->ctx)->FreeDataSpace(
-          arr->ctx, arr->data);
-    }
-  }
-  delete arr;
-}
-
-inline void VerifyType(int dtype_code, int dtype_bits, int dtype_lanes) {
-  CHECK_GE(dtype_lanes, 1);
-  if (dtype_code == kDLFloat) {
-    CHECK_EQ(dtype_bits % 8, 0);
-  } else {
-    CHECK_EQ(dtype_bits % 8, 0);
-  }
-  CHECK_EQ(dtype_bits & (dtype_bits - 1), 0);
-}
-
-inline size_t GetDataSize(TVMArray* arr) {
-  size_t size = 1;
-  for (tvm_index_t i = 0; i < arr->ndim; ++i) {
-    size *= arr->shape[i];
-  }
-  size *= (arr->dtype.bits * arr->dtype.lanes + 7) / 8;
-  return size;
-}
-
-inline size_t GetDataAlignment(TVMArray* arr) {
-  size_t align = (arr->dtype.bits / 8) * arr->dtype.lanes;
-  if (align < kAllocAlignment) return kAllocAlignment;
-  return align;
-}
-
 }  // namespace runtime
 }  // namespace tvm
 
@@ -370,110 +322,6 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func,
   API_END();
 }
 
-int TVMArrayAlloc(const tvm_index_t* shape,
-                  int ndim,
-                  int dtype_code,
-                  int dtype_bits,
-                  int dtype_lanes,
-                  int device_type,
-                  int device_id,
-                  TVMArrayHandle* out) {
-  TVMArray* arr = nullptr;
-  API_BEGIN();
-  // shape
-  arr = TVMArrayCreate_();
-  // ndim
-  arr->ndim = ndim;
-  // dtype
-  VerifyType(dtype_code, dtype_bits, dtype_lanes);
-  arr->dtype.code = static_cast<uint8_t>(dtype_code);
-  arr->dtype.bits = static_cast<uint8_t>(dtype_bits);
-  arr->dtype.lanes = static_cast<uint16_t>(dtype_lanes);
-  if (ndim != 0) {
-    tvm_index_t* shape_copy = new tvm_index_t[ndim];
-    std::copy(shape, shape + ndim, shape_copy);
-    arr->shape = shape_copy;
-  } else {
-    arr->shape = nullptr;
-  }
-  // ctx
-  arr->ctx.device_type = static_cast<DLDeviceType>(device_type);
-  arr->ctx.device_id = device_id;
-  size_t size = GetDataSize(arr);
-  size_t alignment = GetDataAlignment(arr);
-  arr->data = DeviceAPIManager::Get(arr->ctx)->AllocDataSpace(
-      arr->ctx, size, alignment, arr->dtype);
-  *out = arr;
-  API_END_HANDLE_ERROR(TVMArrayFree_(arr));
-}
-
-int TVMArrayFree(TVMArrayHandle handle) {
-  API_BEGIN();
-  TVMArray* arr = handle;
-  TVMArrayFree_(arr);
-  API_END();
-}
-
-int TVMArrayCopyFromTo(TVMArrayHandle from,
-                       TVMArrayHandle to,
-                       TVMStreamHandle stream) {
-  API_BEGIN();
-  size_t from_size = GetDataSize(from);
-  size_t to_size = GetDataSize(to);
-  CHECK_EQ(from_size, to_size)
-    << "TVMArrayCopyFromTo: The size must exactly match";
-
-  CHECK(from->ctx.device_type == to->ctx.device_type
-        || from->ctx.device_type == kDLCPU
-        || to->ctx.device_type == kDLCPU)
-    << "Can not copy across different ctx types directly";
-
-  // Use the context that is *not* a cpu context to get the correct device
-  // api manager.
-  TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx;
-
-  DeviceAPIManager::Get(ctx)->CopyDataFromTo(
-    from->data, static_cast<size_t>(from->byte_offset),
-    to->data, static_cast<size_t>(to->byte_offset),
-    from_size, from->ctx, to->ctx, from->dtype, stream);
-
-  API_END();
-}
-
-int TVMArrayCopyFromBytes(TVMArrayHandle handle,
-                          void* data,
-                          size_t nbytes) {
-  API_BEGIN();
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
-  size_t arr_size = GetDataSize(handle);
-  CHECK_EQ(arr_size, nbytes)
-      << "TVMArrayCopyFromBytes: size mismatch";
-  DeviceAPIManager::Get(handle->ctx)->CopyDataFromTo(
-      data, 0,
-      handle->data, static_cast<size_t>(handle->byte_offset),
-      nbytes, cpu_ctx, handle->ctx, handle->dtype, nullptr);
-  API_END();
-}
-
-int TVMArrayCopyToBytes(TVMArrayHandle handle,
-                        void* data,
-                        size_t nbytes) {
-  API_BEGIN();
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
-  size_t arr_size = GetDataSize(handle);
-  CHECK_EQ(arr_size, nbytes)
-      << "TVMArrayCopyToBytes: size mismatch";
-  DeviceAPIManager::Get(handle->ctx)->CopyDataFromTo(
-      handle->data, static_cast<size_t>(handle->byte_offset),
-      data, 0,
-      nbytes, handle->ctx, cpu_ctx, handle->dtype, nullptr);
-  API_END();
-}
-
 int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out) {
   API_BEGIN();
   TVMContext ctx;
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
new file mode 100644
index 000000000000..f862f32f6e99
--- /dev/null
+++ b/src/runtime/ndarray.cc
@@ -0,0 +1,248 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file ndarray.cc
+ * \brief NDArray container infrastructure.
+ */
+#include <dmlc/logging.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+#include "./runtime_base.h"
+
+// deleter for arrays used by DLPack exporter
+extern "C" void NDArrayDLPackDeleter(DLManagedTensor* tensor);
+
+namespace tvm {
+namespace runtime {
+
+inline void VerifyDataType(DLDataType dtype) {
+  // A valid element type has at least one lane ...
+  CHECK_GE(dtype.lanes, 1);
+  // ... a bit width that is a whole number of bytes (the same rule
+  // is applied to float and non-float type codes) ...
+  CHECK_EQ(dtype.bits % 8, 0);
+  // ... and that bit width must be a power of two.
+  CHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
+}
+
+inline size_t GetDataSize(const DLTensor& arr) {
+  // start from the bytes of one element (rounded up), then scale by every extent
+  size_t nbytes = (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
+  for (tvm_index_t axis = 0; axis < arr.ndim; ++axis) {
+    nbytes *= arr.shape[axis];
+  }
+  return nbytes;
+}
+
+inline size_t GetDataAlignment(const DLTensor& arr) {
+  // byte size of one element (all lanes), raised to
+  // kAllocAlignment whenever it is smaller than that
+  const size_t natural = (arr.dtype.bits / 8) * arr.dtype.lanes;
+  return (natural < kAllocAlignment) ? kAllocAlignment : natural;
+}
+
+struct NDArray::Internal {
+  // Default deleter: a view (manager_ctx set) releases its parent, else frees device memory
+  static void DefaultDeleter(NDArray::Container* ptr) {
+    using tvm::runtime::NDArray;
+    if (ptr->manager_ctx != nullptr) {
+      static_cast<NDArray::Container*>(ptr->manager_ctx)->DecRef();
+    } else if (ptr->dl_tensor.data != nullptr) {
+      tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx)->FreeDataSpace(
+          ptr->dl_tensor.ctx, ptr->dl_tensor.data);
+    }
+    delete ptr;
+  }
+  // Deleter for NDArray converted from DLPack
+  // This is used from data which is passed from external DLPack(DLManagedTensor)
+  // that are not allocated inside of TVM.
+  // This enables us to create NDArray from memory allocated by other
+  // frameworks that are DLPack compatible
+  static void DLPackDeleter(NDArray::Container* ptr) {
+    DLManagedTensor* tensor = static_cast<DLManagedTensor*>(ptr->manager_ctx);
+    if (tensor->deleter != nullptr) {
+      (*tensor->deleter)(tensor);
+    }
+    delete ptr;
+  }
+  // Local create function which allocates tensor metadata
+  // but does not allocate space for the data.
+  static NDArray Create(std::vector<int64_t> shape,
+                        DLDataType dtype,
+                        DLContext ctx) {
+    VerifyDataType(dtype);
+    // critical zone: the raw container is not yet owned by an NDArray
+    NDArray::Container* data = new NDArray::Container();
+    data->deleter = DefaultDeleter;
+    NDArray ret(data);
+    ret.data_ = data;  // NOTE(review): redundant, ret already refers to data
+    // RAII now in effect
+    // setup shape
+    data->shape_ = std::move(shape);
+    data->dl_tensor.shape = dmlc::BeginPtr(data->shape_);
+    data->dl_tensor.ndim = static_cast<int>(data->shape_.size());
+    // setup dtype
+    data->dl_tensor.dtype = dtype;
+    // setup ctx
+    data->dl_tensor.ctx = ctx;
+    return ret;
+  }
+  // Release ownership from arr and hand its container out as a raw DLTensor*
+  static DLTensor* MoveAsDLTensor(NDArray arr) {
+    DLTensor* tensor = const_cast<DLTensor*>(arr.operator->());
+    CHECK(reinterpret_cast<DLTensor*>(arr.data_) == tensor);
+    arr.data_ = nullptr;  // keeps arr's destructor from dropping the reference
+    return tensor;
+  }
+};
+
+NDArray NDArray::CreateView(std::vector<int64_t> shape,
+                            DLDataType dtype) {
+  CHECK(data_ != nullptr);
+  CHECK(data_->dl_tensor.strides == nullptr)
+      << "Can only create view for compact tensor";
+  NDArray ret = Internal::Create(std::move(shape), dtype, data_->dl_tensor.ctx);
+  ret.data_->dl_tensor.byte_offset =
+      this->data_->dl_tensor.byte_offset;
+  size_t curr_size = GetDataSize(this->data_->dl_tensor);
+  size_t view_size = GetDataSize(ret.data_->dl_tensor);
+  CHECK_LE(view_size, curr_size)
+      << "Tries to create a view that has bigger memory than current one";
+  // the view keeps the parent alive; DefaultDeleter drops this reference
+  this->data_->IncRef();
+  ret.data_->manager_ctx = this->data_;
+  ret.data_->dl_tensor.data = this->data_->dl_tensor.data;
+  return ret;
+}
+
+DLManagedTensor* NDArray::ToDLPack() const {
+  CHECK(data_ != nullptr);
+  DLManagedTensor* ret = new DLManagedTensor();
+  ret->dl_tensor = data_->dl_tensor;
+  ret->manager_ctx = data_;  // must be the Container: NDArrayDLPackDeleter DecRefs it
+  data_->IncRef();  // reference owned by the exported DLManagedTensor
+  ret->deleter = NDArrayDLPackDeleter;
+  return ret;
+}
+
+NDArray NDArray::Empty(std::vector<int64_t> shape,
+                       DLDataType dtype,
+                       DLContext ctx) {
+  NDArray ret = Internal::Create(std::move(shape), dtype, ctx);
+  // allocate the data buffer; ret already owns the container at this point
+  size_t size = GetDataSize(ret.data_->dl_tensor);
+  size_t alignment = GetDataAlignment(ret.data_->dl_tensor);
+  ret.data_->dl_tensor.data =
+      DeviceAPI::Get(ret->ctx)->AllocDataSpace(
+          ret->ctx, size, alignment, ret->dtype);
+  return ret;
+}
+
+NDArray NDArray::FromDLPack(DLManagedTensor* tensor) {  // NOTE(review): tensor is not null-checked
+  NDArray::Container* data = new NDArray::Container();
+  data->deleter = Internal::DLPackDeleter;  // invokes tensor->deleter when the count hits zero
+  data->manager_ctx = tensor;
+  data->dl_tensor = tensor->dl_tensor;  // shallow copy: data and shape pointers still belong to tensor
+  return NDArray(data);
+}
+
+void NDArray::CopyFromTo(DLTensor* from,
+                         DLTensor* to,
+                         TVMStreamHandle stream) {
+  size_t from_size = GetDataSize(*from);  // sizes are compared in bytes, not by shape
+  size_t to_size = GetDataSize(*to);
+  CHECK_EQ(from_size, to_size)
+    << "TVMArrayCopyFromTo: The size must exactly match";
+
+  CHECK(from->ctx.device_type == to->ctx.device_type
+        || from->ctx.device_type == kDLCPU
+        || to->ctx.device_type == kDLCPU)
+    << "Can not copy across different ctx types directly";
+
+  // Use the context that is *not* a cpu context to get the correct device
+  // api manager.
+  TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx;
+
+  DeviceAPI::Get(ctx)->CopyDataFromTo(
+    from->data, static_cast<size_t>(from->byte_offset),
+    to->data, static_cast<size_t>(to->byte_offset),
+    from_size, from->ctx, to->ctx, from->dtype, stream);
+}
+
+}  // namespace runtime
+}  // namespace tvm
+
+using namespace tvm::runtime;
+
+void NDArrayDLPackDeleter(DLManagedTensor* tensor) {
+  static_cast<NDArray::Container*>(tensor->manager_ctx)->DecRef();  // treats manager_ctx as a Container*
+  delete tensor;  // frees only the DLManagedTensor wrapper itself
+}
+
+int TVMArrayAlloc(const tvm_index_t* shape,
+                  int ndim,
+                  int dtype_code,
+                  int dtype_bits,
+                  int dtype_lanes,
+                  int device_type,
+                  int device_id,
+                  TVMArrayHandle* out) {
+  API_BEGIN();
+  DLDataType dtype;  // element type assembled from the three integer fields
+  dtype.code = static_cast<uint8_t>(dtype_code);
+  dtype.bits = static_cast<uint8_t>(dtype_bits);
+  dtype.lanes = static_cast<uint16_t>(dtype_lanes);
+  DLContext ctx;
+  ctx.device_type = static_cast<DLDeviceType>(device_type);
+  ctx.device_id = device_id;
+  *out = NDArray::Internal::MoveAsDLTensor(  // the handle keeps the one reference of the new array
+      NDArray::Empty(std::vector<int64_t>(shape, shape + ndim), dtype, ctx));
+  API_END();
+}
+
+int TVMArrayFree(TVMArrayHandle handle) {
+  API_BEGIN();  // a null handle is accepted and ignored
+  if (handle != nullptr) reinterpret_cast<NDArray::Container*>(handle)->DecRef();
+  API_END();
+}
+
+int TVMArrayCopyFromTo(TVMArrayHandle from,
+                       TVMArrayHandle to,
+                       TVMStreamHandle stream) {
+  API_BEGIN();
+  NDArray::CopyFromTo(from, to, stream);  // C API entry point forwarding to the C++ implementation
+  API_END();
+}
+
+int TVMArrayCopyFromBytes(TVMArrayHandle handle,
+                          void* data,
+                          size_t nbytes) {
+  API_BEGIN();
+  TVMContext cpu_ctx;  // the raw buffer is described as CPU device 0
+  cpu_ctx.device_type = kDLCPU;
+  cpu_ctx.device_id = 0;
+  size_t arr_size = GetDataSize(*handle);
+  CHECK_EQ(arr_size, nbytes)
+      << "TVMArrayCopyFromBytes: size mismatch";
+  DeviceAPI::Get(handle->ctx)->CopyDataFromTo(
+      data, 0,  // source: raw bytes, offset 0
+      handle->data, static_cast<size_t>(handle->byte_offset),  // destination: the array
+      nbytes, cpu_ctx, handle->ctx, handle->dtype, nullptr);
+  API_END();
+}
+
+int TVMArrayCopyToBytes(TVMArrayHandle handle,
+                        void* data,
+                        size_t nbytes) {
+  API_BEGIN();
+  TVMContext cpu_ctx;  // the raw buffer is described as CPU device 0
+  cpu_ctx.device_type = kDLCPU;
+  cpu_ctx.device_id = 0;
+  size_t arr_size = GetDataSize(*handle);
+  CHECK_EQ(arr_size, nbytes)
+      << "TVMArrayCopyToBytes: size mismatch";
+  DeviceAPI::Get(handle->ctx)->CopyDataFromTo(
+      handle->data, static_cast<size_t>(handle->byte_offset),  // source: the array
+      data, 0,  // destination: raw bytes, offset 0
+      nbytes, handle->ctx, cpu_ctx, handle->dtype, nullptr);
+  API_END();
+}
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 2f181e7edf9a..21fff7b29882 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -175,7 +175,12 @@ class RPCSession::EventHandler : public dmlc::Stream {
   // send Packed sequence to writer.
   void SendPackedSeq(const TVMValue* arg_values, const int* type_codes, int n) {
     this->Write(n);
-    this->WriteArray(type_codes, n);
+    // NDArray containers are written with the plain array handle type code.
+    for (int i = 0; i < n; ++i) {
+      int tcode = type_codes[i];
+      if (tcode == kNDArrayContainer) tcode = kArrayHandle;
+      this->Write(tcode);
+    }
     // Argument packing.
     for (int i = 0; i < n; ++i) {
       int tcode = type_codes[i];
@@ -207,6 +212,7 @@ class RPCSession::EventHandler : public dmlc::Stream {
           this->Write(handle);
           break;
         }
+        case kNDArrayContainer:
         case kArrayHandle: {
           DLTensor* arr = static_cast<DLTensor*>(value.v_handle);
           TVMContext ctx = StripSessMask(arr->ctx);
diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc
index 00e428f258a9..9b2f1df73731 100644
--- a/tests/cpp/packed_func_test.cc
+++ b/tests/cpp/packed_func_test.cc
@@ -38,6 +38,31 @@ TEST(PackedFunc, Node) {
   CHECK(t.same_as(x));
 }
 
+TEST(PackedFunc, NDArray) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  auto x = NDArray::Empty(
+      {}, String2TVMType("float32"),  // empty shape: a single float32 element
+      TVMContext{kDLCPU, 0});
+  reinterpret_cast<float*>(x->data)[0] = 10.0f;
+  CHECK(x.use_count() == 1);
+
+  PackedFunc forward([&](TVMArgs args, TVMRetValue* rv) {  // returns its first argument unchanged
+      *rv = args[0];
+    });
+
+  NDArray ret = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+      NDArray y = args[0];      // converting the argument to NDArray adds a reference
+      DLTensor* ptr = args[0];  // the same argument also converts to a DLTensor*
+      CHECK(ptr == x.operator->());
+      CHECK(x.same_as(y));
+      CHECK(x.use_count() == 2);
+      *rv = forward(y);
+    })(x);
+  CHECK(ret.use_count() == 2);  // only x and ret remain as owners
+  CHECK(ret.same_as(x));
+}
+
 TEST(PackedFunc, str) {
   using namespace tvm;
   using namespace tvm::runtime;