From c0690496afeab214f46d7b6763a804cf0bf1964e Mon Sep 17 00:00:00 2001
From: Manupa Karunaratne <manupa.karunaratne@arm.com>
Date: Fri, 7 May 2021 22:54:40 +0100
Subject: [PATCH] Improved MLF to contain workspace info (#7938)

* Improved MLF to contain workspace info

Added functionality to calculate workspace, io and constant
memory required by each primfunc and main function. Moreover,
the workspace information required by each primfunc and main
is reported in metadata.json in the Model Library Format (MLF).
- added functionality to record tir and relay primfuncs
- added tests for model_library_format changes

Change-Id: Ib4a8b787345aa35f8a1645e8a648fad84de37bce

* Improved MLF to contain workspace info

* disable AoT for now
* addressing comments

Change-Id: I5f041ec461b02dac6ea9c96ea50eb400d55eef53

* Improved MLF to contain workspace info

* addressed comments
* added aot executor support

Change-Id: I9b54a7939d8ccb3c6ce0454f0fe62866ac66eb5c

* Improved MLF to contain workspace info

* removed redundant utils.py

Change-Id: I256dd88fab31a595bf9509bd1c4ab59b0c145b1e

* Improved MLF to contain workspace info

* removed redundant ffi api

Change-Id: I9ad6795aa839edfdfd05b902d4531fb0a20e894d
---
 python/tvm/micro/model_library_format.py      |  95 ++++++++-
 python/tvm/relay/backend/executor_factory.py  |  12 +-
 python/tvm/relay/build_module.py              |  12 +-
 src/relay/backend/aot_executor_codegen.cc     |  87 +++++++-
 src/relay/backend/build_module.cc             |   8 +
 src/relay/backend/graph_executor_codegen.cc   | 187 ++++++++++++++++--
 src/relay/backend/utils.cc                    |  59 ++++++
 src/relay/backend/utils.h                     |  32 +++
 src/tir/analysis/calculate_workspace.cc       |   7 +-
 .../test_micro_model_library_format.py        | 131 ++++++++++--
 10 files changed, 587 insertions(+), 43 deletions(-)
 create mode 100644 src/relay/backend/utils.cc

diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index 4fd85ea38d98..be991e22a0f8 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -27,6 +27,9 @@
 from ..relay.backend import executor_factory
 from ..relay import param_dict
 
+# This should be kept identical to runtime::symbol::tvm_module_main
+MAIN_FUNC_NAME_STR = "__tvm_main__"
+
 
 class UnsupportedInModelLibraryFormatError(Exception):
     """Raised when export_model_library_format does not support the given Module tree."""
@@ -73,8 +76,16 @@ def _populate_codegen_dir(mod, codegen_dir: str):
         dso_mod.save(file_name)
 
 
-def _build_memory_map(graph_json):
-    """Build a simpler memory map from graph JSON.
+def _build_memory_map(mod):
+    ret = dict()
+    if isinstance(mod, executor_factory.GraphExecutorFactoryModule):
+        ret["sids"] = _build_sid_map(mod.graph_json)
+    ret["functions"] = _build_function_memory_map(mod.function_metadata)
+    return ret
+
+
+def _build_sid_map(graph_json):
+    """Build a simpler storage id info map from graph JSON.
 
     Parameters
     ----------
@@ -117,6 +128,81 @@ def _build_memory_map(graph_json):
     return memory_map
 
 
+def _build_function_memory_map(function_metadata):
+    """Build a simple map that shows how much workspace is required to execute
+    each primitive function. The main_func describes how much memory is required
+    to execute the main control code.
+
+    Parameters
+    ----------
+    function_metadata : Map<String, FunctionInfo>
+        This contains all the compiled metadata on a function basis
+
+    Returns
+    -------
+    dict :
+        This will have two entries:
+        1.) A list with one entry per function describing local memory it is using.
+        2.) A global memory requirement if all functions are executed sequentially
+    """
+    device_max_workspace = dict()
+    main_func_metadata = function_metadata[MAIN_FUNC_NAME_STR]
+    num_targets = len(main_func_metadata.workspace_sizes.items())
+    func_entries = []
+    target_local_entries = dict()
+    for i in range(num_targets):
+        target = main_func_metadata.workspace_sizes.items()[i][0]
+        device_max_workspace[target] = 0
+        for func_name, finfo in function_metadata.items():
+            if func_name == MAIN_FUNC_NAME_STR:
+                continue
+            target_local_entries[func_name] = list()
+
+        for func_name, finfo in function_metadata.items():
+            if func_name == MAIN_FUNC_NAME_STR:
+                continue
+            assert len(finfo.constant_sizes.items()) == num_targets
+            assert len(finfo.io_sizes.items()) == num_targets
+            target = finfo.workspace_sizes.items()[i][0]
+            workspace_size = finfo.workspace_sizes.items()[i][1]
+            target_entry = {
+                "device": int(target.kind.device_type),
+                "workspace_size_bytes": int(workspace_size),
+            }
+            target_local_entries[func_name].append(target_entry)
+            if workspace_size > device_max_workspace[target]:
+                device_max_workspace[target] = workspace_size
+
+    for func_name, target_entries_ in target_local_entries.items():
+        func_entry = {
+            "function_name": str(func_name),
+            "workspace": target_entries_,
+        }
+        func_entries.append(func_entry)
+
+    target_main_entries = list()
+    for i in range(num_targets):
+        target = main_func_metadata.workspace_sizes.items()[i][0]
+        main_func_local_workspace = main_func_metadata.workspace_sizes.items()[i][1]
+        main_func_constants = main_func_metadata.constant_sizes.items()[i][1]
+        main_func_io = main_func_metadata.io_sizes.items()[i][1]
+        target_main_entries.append(
+            {
+                "device": int(target.kind.device_type),
+                "workspace_size_bytes": int(device_max_workspace[target])
+                + int(main_func_local_workspace),
+                "constants_size_bytes": int(main_func_constants),
+                "io_size_bytes": int(main_func_io),
+            }
+        )
+
+    ret = {
+        "operator_functions": func_entries,
+        "main": target_main_entries,
+    }
+    return ret
+
+
 def export_model_library_format(mod: executor_factory.ExecutorFactoryModule, file_name):
     """Export the build artifact in Model Library Format.
 
@@ -133,14 +219,13 @@ def export_model_library_format(mod: executor_factory.ExecutorFactoryModule, fil
     """
     tempdir = utils.tempdir()
     is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
-    memory_map = [] if is_aot else _build_memory_map(mod.get_executor_config())
     runtime = ["aot"] if is_aot else ["graph"]
 
     metadata = {
-        "version": 1,
+        "version": 2,
         "model_name": mod.libmod_name,
         "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"),
-        "memory": memory_map,
+        "memory": _build_memory_map(mod),
         "target": {int(k): str(v) for k, v in mod.target.items()},
         "runtimes": runtime,
     }
diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py
index f81d8f9f1c15..4ed76f4b6366 100644
--- a/python/tvm/relay/backend/executor_factory.py
+++ b/python/tvm/relay/backend/executor_factory.py
@@ -81,15 +81,18 @@ class AOTExecutorFactoryModule(ExecutorFactoryModule):
         The name of module
     params : dict of str to NDArray
         The parameters of module
+    function_metadata : Map of String to FunctionInfo
+        This holds a map from function names to their information
     """
 
-    def __init__(self, ir_mod, target, libmod, libmod_name, params):
+    def __init__(self, ir_mod, target, libmod, libmod_name, params, function_metadata):
         self.ir_mod = ir_mod
         self.target = target
         self.lib = libmod
         self.libmod_name = libmod_name
         self.params = params
         self.iter_cnt = 0
+        self.function_metadata = function_metadata
 
     def get_params(self):
         return self.params
@@ -118,9 +121,13 @@ class GraphExecutorFactoryModule(ExecutorFactoryModule):
         The name of module
     params : dict of str to NDArray
         The parameters of module
+    function_metadata : Map of String to FunctionInfo
+        This holds a map from function names to their information
     """
 
-    def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params):
+    def __init__(
+        self, ir_mod, target, graph_json_str, libmod, libmod_name, params, function_metadata
+    ):
         assert isinstance(graph_json_str, string_types)
         fcreate = get_global_func("tvm.graph_executor_factory.create")
         args = []
@@ -136,6 +143,7 @@ def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params):
         self.libmod_name = libmod_name
         self.params = params
         self.iter_cnt = 0
+        self.function_metadata = function_metadata
 
     def export_library(self, file_name, fcompile=None, addons=None, **kwargs):
         return self.module.export_library(file_name, fcompile, addons, **kwargs)
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 2d8c8207c930..e134eeeefd09 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -83,6 +83,7 @@ def __init__(self):
         self._optimize = self.mod["optimize"]
         self._set_params_func = self.mod["set_params"]
         self._get_params_func = self.mod["get_params"]
+        self._get_function_metadata = self.mod["get_function_metadata"]
 
     def build(self, mod, target=None, target_host=None, params=None, executor="graph"):
         """
@@ -200,6 +201,12 @@ def get_module(self):
         """Return the built module."""
         return self._get_module()
 
+    def get_function_metadata(self):
+        """Return the compiled function metadata.
+        Currently, the metadata contains workspace size required by
+        each PrimFunc"""
+        return self._get_function_metadata()
+
     def get_params(self):
         """Return the updated weights."""
         params = self._get_params_func()
@@ -325,14 +332,15 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"
         executor_config, runtime_mod, params = bld_mod.build(
             mod=ir_mod, target=target, params=params, executor=executor
         )
+        func_metadata = bld_mod.get_function_metadata()
 
         if executor == "aot":
             executor_factory = _executor_factory.AOTExecutorFactoryModule(
-                ir_mod, target, runtime_mod, mod_name, params
+                ir_mod, target, runtime_mod, mod_name, params, func_metadata
             )
         elif executor == "graph":
             executor_factory = _executor_factory.GraphExecutorFactoryModule(
-                ir_mod, target, executor_config, runtime_mod, mod_name, params
+                ir_mod, target, executor_config, runtime_mod, mod_name, params, func_metadata
             )
         else:
             assert False, "Executor " + executor + " not supported"
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 1939e05e2075..ef188b9df175 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -25,8 +25,11 @@
 #include <tvm/ir/module.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/object.h>
+#include <tvm/tir/analysis.h>
 #include <tvm/tir/builtin.h>
 #include <tvm/tir/expr.h>
+#include <tvm/tir/function.h>
 #include <tvm/tir/stmt.h>
 
 #include <algorithm>
@@ -270,6 +273,79 @@ class AOTExecutorCodegen : public ExprVisitor {
     return ss.str();
   }
 
+  /*!
+   * \brief Update the "main" control function's metadata
+   *
+   * \param func The main function that contains calls to operator tir primitive functions
+   */
+  void UpdateMainWorkspaceSize(const tir::PrimFunc& primfunc, const relay::Function& func) {
+    Integer workspace_size = CalculateWorkspaceBytes(primfunc);
+    // Populate FunctionInfo
+    auto fi_node = make_object<FunctionInfoNode>();
+    // Initialize all target workspaces to zero
+    for (const auto& kv : targets_) {
+      auto tgt = kv.second;
+      fi_node->workspace_sizes.Set(tgt, 0);
+    }
+    fi_node->workspace_sizes.Set(target_host_, workspace_size);
+    fi_node->relay_primfuncs.Set(target_host_, func);
+
+    int64_t io_size = 0;
+    for (const auto& input : input_vars_) {
+      io_size += CalculateRelayExprSizeBytes(input->checked_type());
+    }
+    io_size += CalculateRelayExprSizeBytes(func->body->checked_type());
+    fi_node->io_sizes.Set(target_host_, io_size);
+
+    int64_t const_size = 0;
+    for (const auto& kv : params_by_expr_) {
+      const_size += CalculateRelayExprSizeBytes(kv.first->checked_type());
+    }
+    fi_node->constant_sizes.Set(target_host_, const_size);
+    function_metadata_.Set(String(runtime::symbol::tvm_module_main), FunctionInfo(fi_node));
+  }
+
+  /*!
+   * \brief Update the function metadata for a given cached function and its relay
+   * primitive function.
+   *
+   * \param cfunc The cached function as provided by the compile engine
+   * \param relay_func The source relay primitive function
+   * \param relay_target The target associated with relay primitive function
+   */
+  void UpdateFunctionMetadata(const CachedFunc& cfunc, const Function& relay_func,
+                              const Target& relay_target) {
+    auto fi_node = make_object<FunctionInfoNode>();
+    for (const auto& kv : cfunc->funcs->functions) {
+      auto primfunc = Downcast<tir::PrimFunc>(kv.second);
+      Integer workspace_size = CalculateWorkspaceBytes(primfunc);
+      Target primfunc_target = relay_target;
+      if (primfunc->attrs->dict.count("target")) {
+        primfunc_target = Downcast<Target>(primfunc->attrs->dict["target"]);
+      }
+      fi_node->workspace_sizes.Set(primfunc_target, workspace_size);
+      // Calculating size for I/O
+      for (auto const& param : primfunc->params) {
+        auto p_shape = primfunc->buffer_map[param]->shape;
+        int num_of_elements = 1;
+        for (const auto& dim_index_expr : p_shape) {
+          if (dim_index_expr->IsInstance<IntImmNode>()) {
+            num_of_elements *= dim_index_expr.as<IntImmNode>()->value;
+          } else {
+            // If shape is dynamic, we cannot calculate workspace in compile time.
+            num_of_elements = 0;
+          }
+        }
+        int element_size = primfunc->buffer_map[param]->dtype.bytes();
+        fi_node->io_sizes.Set(primfunc_target, element_size * num_of_elements);
+      }
+      fi_node->constant_sizes.Set(primfunc_target, 0);
+      fi_node->tir_primfuncs.Set(primfunc_target, primfunc);
+      fi_node->relay_primfuncs.Set(primfunc_target, relay_func);
+    }
+    function_metadata_.Set(cfunc->func_name, FunctionInfo(fi_node));
+  }
+
   void VisitExpr_(const CallNode* op) override {
     // Descend the call tree
     for (auto arg : op->args) {
@@ -336,6 +412,8 @@ class AOTExecutorCodegen : public ExprVisitor {
       lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
     }
     lowered_funcs_[target->str()]->Update(lowered_func->funcs);
+    // Update function metadata via looking at all primfuncs
+    UpdateFunctionMetadata(lowered_func, func, target);
 
     // Generate the TIR function call
     CreateFuncCall(GetRef<Call>(op), lowered_func->func_name);
@@ -488,6 +566,8 @@ class AOTExecutorCodegen : public ExprVisitor {
   std::unordered_map<int, te::Var> sids_table_;
   /*! \brief lowered funcs */
   std::unordered_map<std::string, IRModule> lowered_funcs_;
+  /*! \brief per-function metadata, keyed by function name */
+  Map<String, FunctionInfo> function_metadata_;
   /*! \brief compile engine */
   CompileEngine compile_engine_;
   /*! \brief the set of statements that make the program */
@@ -531,6 +611,7 @@ class AOTExecutorCodegen : public ExprVisitor {
     VisitExpr(func->body);
 
     auto prim_func = CreateMainFunc(func->params.size());
+    UpdateMainWorkspaceSize(prim_func, func);
     LoweredOutput ret;
 
     ret.params = std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>>();
@@ -559,7 +640,7 @@ class AOTExecutorCodegen : public ExprVisitor {
       symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func);
       ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map));
     }
-
+    ret.function_metadata = std::move(function_metadata_);
     ret.metadata =
         runtime::Metadata(input_vars_.size(), return_sid_.size(), runtime::kTvmExecutorAot);
     return ret;
@@ -602,6 +683,10 @@ class AOTExecutorCodegenModule : public runtime::ModuleNode {
     } else if (name == "get_external_modules") {
       return PackedFunc(
           [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_external_modules(); });
+    } else if (name == "get_function_metadata") {
+      return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->output_.function_metadata;
+      });
     } else if (name == "get_metadata") {
       return PackedFunc(
           [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = output_.metadata; });
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index 88faff22cd31..880407f14b8e 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -62,6 +62,10 @@ struct ExecutorCodegen {
 
   virtual void UpdateOutput(BuildOutput* ret) = 0;
 
+  Map<String, FunctionInfo> GetFunctionMetadata() {
+    return CallFunc<Map<String, FunctionInfo>>("get_function_metadata", nullptr);
+  }
+
   std::unordered_map<std::string, tvm::runtime::NDArray> GetParams() {
     std::unordered_map<std::string, tvm::runtime::NDArray> ret;
     auto names = CallFunc<Array<runtime::String>>("list_params_name", nullptr);
@@ -197,6 +201,10 @@ class RelayBuildModule : public runtime::ModuleNode {
       return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         *rv = this->executor_codegen_->GetExternalModules();
       });
+    } else if (name == "get_function_metadata") {
+      return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->executor_codegen_->GetFunctionMetadata();
+      });
     } else if (name == "optimize") {
       return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         ICHECK_EQ(args.num_args, 2);
diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc
index 6d3b93e08e61..ddcdeaac5d61 100644
--- a/src/relay/backend/graph_executor_codegen.cc
+++ b/src/relay/backend/graph_executor_codegen.cc
@@ -28,6 +28,8 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/object.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/function.h>
 
 #include <list>
 #include <string>
@@ -182,9 +184,96 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     targets_ = targets;
   }
 
+  /*!
+   * \brief Update the "main" control function's metadata
+   *
+   * \param func The main function that contains calls to relay primitive functions
+   */
+  void UpdateMainWorkspaceSize(const Function& func) {
+    // This is a Map<device,Map<storage_id, size>>
+    std::unordered_map<int, std::unordered_map<int, int>> sid_workspace;
+    // This is a Map<device, size_of_inputs_and_outputs>
+    std::unordered_map<int, int> device_io;
+    // This is a Map<device, size_of_constants>
+    std::unordered_map<int, int> device_consts;
+
+    // Initialize the maps to zero
+    for (const auto& kv : storage_device_map_) {
+      auto sids = kv.second[0];
+      auto devices = kv.second[1];
+      CHECK_EQ(sids.size(), devices.size());
+      for (uint32_t i = 0; i < sids.size(); i++) {
+        sid_workspace[devices[i]][sids[i]] = 0;
+        device_io[devices[i]] = 0;
+        device_consts[devices[i]] = 0;
+      }
+    }
+
+    // Collect sizes of tensors
+    for (const auto& kv : storage_device_map_) {
+      auto size_bytes = CalculateRelayExprSizeBytes(kv.first->checked_type());
+      auto sids = kv.second[0];
+      auto devices = kv.second[1];
+      if (kv.first->IsInstance<ConstantNode>()) {
+        for (const auto& dev : devices) {
+          device_consts[dev] += size_bytes;
+        }
+        continue;
+      } else if (kv.first->IsInstance<VarNode>() || kv.first == func->body) {
+        for (const auto& dev : devices) {
+          device_io[dev] += size_bytes;
+        }
+        continue;
+      }
+      for (uint32_t i = 0; i < sids.size(); i++) {
+        // Here we record the largest size of the tensors
+        // that share the same storage id, because storage_id will
+        // be shared between multiple tensors that are not live simultaneously.
+        if (size_bytes > sid_workspace[devices[i]][sids[i]]) {
+          sid_workspace[devices[i]][sids[i]] = size_bytes;
+        }
+      }
+    }
+
+    // This is a Map<device, workspace_size>
+    std::unordered_map<int, int> device_workspace;
+    // Once we know the sizes of sids, we need to accumulate per device
+    for (const auto& dev_sid_size : sid_workspace) {
+      auto dev = dev_sid_size.first;
+      device_workspace[dev] = 0;
+      for (const auto& sid_size : dev_sid_size.second) {
+        device_workspace[dev] += sid_size.second;
+      }
+    }
+
+    // Populate FunctionInfo
+    auto fi_node = make_object<FunctionInfoNode>();
+    // Initialize all target workspaces to zero
+    for (const auto& kv : targets_) {
+      auto tgt = kv.second;
+      fi_node->workspace_sizes.Set(tgt, 0);
+    }
+    for (const auto& dev_and_size : device_workspace) {
+      auto tgt = GetTargetFromInteger(dev_and_size.first);
+      fi_node->workspace_sizes.Set(tgt, dev_and_size.second);
+      fi_node->relay_primfuncs.Set(tgt, func);
+    }
+    for (const auto& dev_and_size : device_io) {
+      auto tgt = GetTargetFromInteger(dev_and_size.first);
+      fi_node->io_sizes.Set(tgt, dev_and_size.second);
+    }
+    for (const auto& dev_and_size : device_consts) {
+      auto tgt = GetTargetFromInteger(dev_and_size.first);
+      fi_node->constant_sizes.Set(tgt, dev_and_size.second);
+    }
+
+    function_metadata_.Set(String(runtime::symbol::tvm_module_main), FunctionInfo(fi_node));
+  }
+
   LoweredOutput Codegen(relay::Function func) {
     auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
     storage_device_map_ = (*pf)(func);
+    UpdateMainWorkspaceSize(func);
     // First we convert all the parameters into input nodes.
     for (auto param : func->params) {
       auto node_ptr = GraphInputNode::make_node_ptr(param->name_hint(), GraphAttrs());
@@ -212,6 +301,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
       ret.lowered_funcs.Set(kv.first, mod);
     }
     ret.external_mods = compile_engine_->LowerExternalFunctions();
+    ret.function_metadata = std::move(function_metadata_);
     return ret;
   }
 
@@ -352,6 +442,75 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     return lhs_storage_id == rhs_storage_id;
   }
 
+  /*!
+   * \brief Obtain the Target from the device type.
+   * If homogeneous compilation, this will return the only target.
+   * If heterogeneous compilation, this will select the associated target using the targets_ Map.
+   *
+   * \param dev_type The integer device type to look up
+   * \return The Target selected for the device type
+   */
+  Target GetTargetFromInteger(int64_t dev_type) {
+    if (targets_.size() == 1) {
+      // homogeneous execution.
+      const auto& it = targets_.begin();
+      return (*it).second;
+    } else {
+      // heterogeneous execution.
+      std::string call_dev_name;
+      if (dev_type == 0) {
+        call_dev_name = "llvm";
+      } else {
+        call_dev_name = runtime::DeviceName(dev_type);
+      }
+      if (targets_.count(dev_type) == 0) {
+        LOG(FATAL) << "No target is provided for device " << call_dev_name;
+      }
+      return targets_[dev_type];
+    }
+  }
+
+  /*!
+   * \brief Update the function metadata for a given cached function and its relay
+   * primitive function.
+   *
+   * \param cfunc The cached function as provided by the compile engine
+   * \param relay_func The source relay primitive function
+   * \param relay_target The target associated with relay primitive function
+   */
+  void UpdateFunctionMetadata(const CachedFunc& cfunc, const Function& relay_func,
+                              const Target& relay_target) {
+    auto fi_node = make_object<FunctionInfoNode>();
+    for (const auto& kv : cfunc->funcs->functions) {
+      auto primfunc = Downcast<tir::PrimFunc>(kv.second);
+      Integer workspace_size = CalculateWorkspaceBytes(primfunc);
+      Target primfunc_target = relay_target;
+      if (primfunc->attrs->dict.count("target")) {
+        primfunc_target = Downcast<Target>(primfunc->attrs->dict["target"]);
+      }
+      fi_node->workspace_sizes.Set(primfunc_target, workspace_size);
+      // Calculating size for I/O
+      for (auto const& param : primfunc->params) {
+        auto p_shape = primfunc->buffer_map[param]->shape;
+        int num_of_elements = 1;
+        for (const auto& dim_index_expr : p_shape) {
+          if (dim_index_expr->IsInstance<IntImmNode>()) {
+            num_of_elements *= dim_index_expr.as<IntImmNode>()->value;
+          } else {
+            // If shape is dynamic, we cannot calculate workspace in compile time.
+            num_of_elements = 0;
+          }
+        }
+        int element_size = primfunc->buffer_map[param]->dtype.bytes();
+        fi_node->io_sizes.Set(primfunc_target, element_size * num_of_elements);
+      }
+      fi_node->constant_sizes.Set(primfunc_target, 0);
+      fi_node->tir_primfuncs.Set(primfunc_target, primfunc);
+      fi_node->relay_primfuncs.Set(primfunc_target, relay_func);
+    }
+    function_metadata_.Set(cfunc->func_name, FunctionInfo(fi_node));
+  }
+
   std::vector<GraphNodeRef> VisitExpr_(const CallNode* op) override {
     Expr expr = GetRef<Expr>(op);
     Function func;
@@ -408,30 +567,18 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     ICHECK_GE(storage_device_map_.count(expr), 0);
     auto& device_type = storage_device_map_[expr][1];
     auto call_dev_type = device_type[0]->value;
+    target = GetTargetFromInteger(call_dev_type);
     // Normal Relay Function
-    if (targets_.size() == 1) {
-      // homogeneous execution.
-      const auto& it = targets_.begin();
-      target = (*it).second;
-    } else {
-      // heterogeneous execution.
-      std::string call_dev_name;
-      if (call_dev_type == 0) {
-        call_dev_name = "llvm";
-      } else {
-        call_dev_name = runtime::DeviceName(call_dev_type);
-      }
-      if (targets_.count(call_dev_type) == 0) {
-        LOG(FATAL) << "No target is provided for device " << call_dev_name;
-      }
-      target = targets_[call_dev_type];
-    }
+
     CCacheKey key = (*pf0)(func, target);
     CachedFunc lowered_func = (*pf1)(compile_engine_, key);
     if (!lowered_funcs_.count(target->str())) {
       lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
     }
     lowered_funcs_[target->str()]->Update(lowered_func->funcs);
+
+    // Update function metadata via looking at all primfuncs
+    UpdateFunctionMetadata(lowered_func, func, target);
     return GraphAddCallNode(op, _GetUniqueName(lowered_func->func_name), lowered_func->func_name,
                             attrs);
   }
@@ -577,6 +724,8 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
   Map<Expr, Array<IntegerArray>> storage_device_map_;
   /*! \brief lowered funcs */
   std::unordered_map<std::string, IRModule> lowered_funcs_;
+  /*! \brief per-function metadata, keyed by function name */
+  Map<String, FunctionInfo> function_metadata_;
   /*! \brief name map */
   std::unordered_map<std::string, size_t> name_map_;
   /*! \brief compile engine */
@@ -643,6 +792,10 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode {
     } else if (name == "get_metadata") {
       return PackedFunc(
           [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.metadata; });
+    } else if (name == "get_function_metadata") {
+      return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->output_.function_metadata;
+      });
     } else {
       return PackedFunc([](TVMArgs args, TVMRetValue* rv) {});
     }
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
new file mode 100644
index 000000000000..be811961e4a1
--- /dev/null
+++ b/src/relay/backend/utils.cc
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/utils.cc
+ * \brief Relay backend utilities.
+ */
+
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
+int64_t CalculateRelayExprSizeBytes(const Type& expr_type) {
+  if (expr_type->IsInstance<TupleTypeNode>()) {
+    auto tuple_type = Downcast<TupleType>(expr_type);
+    int64_t size = 0;
+    for (const auto& field : tuple_type->fields) {
+      size += CalculateRelayExprSizeBytes(field);
+    }
+    return size;
+  }
+  auto tensor_type = expr_type.as<TensorTypeNode>();
+  auto shape = tensor_type->shape;
+  int num_of_elements = 1;
+  for (const auto& dim_index_expr : shape) {
+    if (dim_index_expr->IsInstance<IntImmNode>()) {
+      num_of_elements *= dim_index_expr.as<IntImmNode>()->value;
+    } else {
+      // If shape is dynamic, we cannot calculate workspace in compile time.
+      num_of_elements = 0;
+    }
+  }
+  auto element_size = tensor_type->dtype.bytes();
+  return element_size * num_of_elements;
+}
+
+TVM_REGISTER_NODE_TYPE(FunctionInfoNode);
+
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index c804768c99af..4f7cbde5b62c 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -46,6 +46,37 @@ namespace tvm {
 namespace relay {
 namespace backend {
 
+struct FunctionInfoNode : public Object {
+  Map<Target, Integer> workspace_sizes;
+  Map<Target, Integer> io_sizes;
+  Map<Target, Integer> constant_sizes;
+  Map<Target, tir::PrimFunc> tir_primfuncs;
+  Map<Target, Function> relay_primfuncs;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("workspace_sizes", &workspace_sizes);
+    v->Visit("io_sizes", &io_sizes);
+    v->Visit("constant_sizes", &constant_sizes);
+    v->Visit("tir_primfuncs", &tir_primfuncs);
+    v->Visit("relay_primfuncs", &relay_primfuncs);
+  }
+
+  static constexpr const char* _type_key = "relay.backend.FunctionInfo";
+  TVM_DECLARE_FINAL_OBJECT_INFO(FunctionInfoNode, Object);
+};
+
+class FunctionInfo : public ObjectRef {
+ public:
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(FunctionInfo, ObjectRef, FunctionInfoNode);
+};
+
+/*!
+ * \brief Calculate the storage required to store the type of relay.Expr
+ *
+ * \param expr_type The type of the relay expr for which the storage is calculated
+ */
+int64_t CalculateRelayExprSizeBytes(const Type& expr_type);
+
 /*!
  *  \brief Executor generator artifacts. Those artifacts  are subsequently
  *  used by the relay build process.
@@ -54,6 +85,7 @@ struct LoweredOutput {
   std::string graph_json;
   Map<String, IRModule> lowered_funcs;
   Array<tvm::runtime::Module> external_mods;
+  Map<String, FunctionInfo> function_metadata;
   std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> params;
   runtime::Metadata metadata;
 };
diff --git a/src/tir/analysis/calculate_workspace.cc b/src/tir/analysis/calculate_workspace.cc
index 8b42efb12ccd..2f5f5e3a671c 100644
--- a/src/tir/analysis/calculate_workspace.cc
+++ b/src/tir/analysis/calculate_workspace.cc
@@ -50,7 +50,12 @@ size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) {
   size_t element_size_bytes = op->dtype.bytes();
   size_t num_elements = 1;
   for (const auto& ext : op->extents) {
-    num_elements *= Downcast<IntImm>(ext)->value;
+    if (ext->IsInstance<IntImmNode>()) {
+      num_elements *= Downcast<IntImm>(ext)->value;
+    } else {
+      // We can't statically calculate workspace for dynamic shapes
+      num_elements = 0;
+    }
   }
   return num_elements * element_size_bytes;
 }
diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py
index 712bd8d348a2..d2c519da22b5 100644
--- a/tests/python/unittest/test_micro_model_library_format.py
+++ b/tests/python/unittest/test_micro_model_library_format.py
@@ -45,9 +45,16 @@ def validate_graph_json(extract_dir, factory):
 
 
 @tvm.testing.requires_micro
-def test_export_model_library_format_c():
+@pytest.mark.parametrize(
+    "target",
+    [
+        ("graph", tvm.target.target.micro("host")),
+        ("aot", tvm.target.target.micro("host", options="-executor=aot")),
+    ],
+)
+def test_export_model_library_format_c(target):
+    executor, _target = target
     with utils.TempDirectory.set_keep_for_debug(True):
-        target = tvm.target.target.micro("host")
         with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
             relay_mod = tvm.parser.fromtext(
                 """
@@ -59,8 +66,8 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
             )
             factory = tvm.relay.build(
                 relay_mod,
-                target,
-                target_host=target,
+                _target,
+                target_host=_target,
                 mod_name="add",
                 params={"c": numpy.array([[2.0, 4.0]], dtype="float32")},
             )
@@ -78,24 +85,41 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
 
         with open(os.path.join(extract_dir, "metadata.json")) as json_f:
             metadata = json.load(json_f)
-            assert metadata["version"] == 1
+            assert metadata["version"] == 2
             assert metadata["model_name"] == "add"
             export_datetime = datetime.datetime.strptime(
                 metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
             )
             assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
-            assert metadata["target"] == {"1": str(target)}
-            assert metadata["memory"] == [
-                {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
-                {"storage_id": 1, "size_bytes": 8, "input_binding": "b"},
-                {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"},
-                {"storage_id": 3, "size_bytes": 8},
+            assert metadata["target"] == {"1": str(_target)}
+            if executor == "graph":
+                assert metadata["memory"]["sids"] == [
+                    {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
+                    {"storage_id": 1, "size_bytes": 8, "input_binding": "b"},
+                    {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"},
+                    {"storage_id": 3, "size_bytes": 8},
+                ]
+            assert metadata["memory"]["functions"]["main"] == [
+                {
+                    "constants_size_bytes": 8,
+                    "device": 1,
+                    "io_size_bytes": 18,
+                    "workspace_size_bytes": 0,
+                }
+            ]
+            assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [
+                {"device": 1, "workspace_size_bytes": 0}
             ]
+            assert (
+                "fused_cast_multiply_add"
+                in metadata["memory"]["functions"]["operator_functions"][0]["function_name"]
+            )
 
         assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib0.c"))
         assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib1.c"))
 
-        validate_graph_json(extract_dir, factory)
+        if executor == "graph":
+            validate_graph_json(extract_dir, factory)
 
         with open(os.path.join(extract_dir, "relay.txt")) as relay_f:
             assert relay_f.read() == str(relay_mod)
@@ -141,19 +165,34 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
 
         with open(os.path.join(extract_dir, "metadata.json")) as json_f:
             metadata = json.load(json_f)
-            assert metadata["version"] == 1
+            assert metadata["version"] == 2
             assert metadata["model_name"] == "add"
             export_datetime = datetime.datetime.strptime(
                 metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
             )
             assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
             assert metadata["target"] == {"1": str(target)}
-            assert metadata["memory"] == [
+            assert metadata["memory"]["sids"] == [
                 {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
                 {"storage_id": 1, "size_bytes": 8, "input_binding": "b"},
                 {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"},
                 {"storage_id": 3, "size_bytes": 8},
             ]
+            assert metadata["memory"]["functions"]["main"] == [
+                {
+                    "constants_size_bytes": 8,
+                    "device": 1,
+                    "io_size_bytes": 18,
+                    "workspace_size_bytes": 0,
+                }
+            ]
+            assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [
+                {"device": 1, "workspace_size_bytes": 0}
+            ]
+            assert (
+                "fused_cast_multiply_add"
+                in metadata["memory"]["functions"]["operator_functions"][0]["function_name"]
+            )
 
         assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "lib", "lib0.o"))
 
@@ -167,11 +206,73 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
             assert "p0" in params
 
 
+@tvm.testing.requires_micro
+@pytest.mark.parametrize(
+    "target",
+    [
+        ("graph", tvm.target.target.micro("host")),
+        ("aot", tvm.target.target.micro("host", options="-executor=aot")),
+    ],
+)
+def test_export_model_library_format_workspace(target):
+    executor, _target = target
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        relay_mod = tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int16], %p2: Tensor[(1, 1, 1, 128), int32]){
+              %0 = nn.conv2d(%p0, %p1, padding=[1, 1, 1, 1], groups=128, channels=128, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="HWOI", out_dtype="int32") /* ty=Tensor[(1, 56, 56, 128), int32] */;
+              %1 = add(%0, %p2) /* ty=Tensor[(1, 56, 56, 128), int32] */;
+              %2 = fixed_point_multiply(%1, multiplier=2080045879, shift=-4) /* ty=Tensor[(1, 56, 56, 128), int32] */;
+              %3 = clip(%2, a_min=0f, a_max=255f) /* ty=Tensor[(1, 56, 56, 128), int32] */;
+              cast(%3, dtype="uint8") /* ty=Tensor[(1, 56, 56, 128), uint8] */
+            }
+            """
+        )
+        factory = tvm.relay.build(relay_mod, _target, target_host=_target, mod_name="qnn_conv2d")
+
+    temp_dir = utils.tempdir()
+    mlf_tar_path = temp_dir.relpath("lib.tar")
+    import tvm.micro as micro
+
+    micro.export_model_library_format(factory, mlf_tar_path)
+    tf = tarfile.open(mlf_tar_path)
+
+    extract_dir = temp_dir.relpath("extract")
+    os.mkdir(extract_dir)
+    tf.extractall(extract_dir)
+
+    with open(os.path.join(extract_dir, "metadata.json")) as json_f:
+        metadata = json.load(json_f)
+        assert metadata["version"] == 2
+        assert metadata["model_name"] == "qnn_conv2d"
+        export_datetime = datetime.datetime.strptime(
+            metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
+        )
+        assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
+        assert metadata["target"] == {"1": str(_target)}
+        assert metadata["memory"]["functions"]["main"] == [
+            {
+                "constants_size_bytes": 0,
+                "device": 1,
+                "io_size_bytes": 1207040,
+                "workspace_size_bytes": 2466816,
+            }
+        ]
+        assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [
+            {"device": 1, "workspace_size_bytes": 2466816}
+        ]
+        assert (
+            "fused_nn_conv2d_add_fixed_point_multiply_clip_cast"
+            in metadata["memory"]["functions"]["operator_functions"][0]["function_name"]
+        )
+
+
 @tvm.testing.requires_micro
 def test_export_model():
     module = tvm.support.FrontendTestModule()
     factory = executor_factory.GraphExecutorFactoryModule(
-        None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {}
+        None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {}, {}
     )
 
     temp_dir = utils.tempdir()