// Copyright (c) 2017-2019, Apple Inc. All rights reserved. // // Use of this source code is governed by a BSD-3-clause license that can be // found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause /** * A neural network is defined through a collection of layers * and represents a directed acyclic graph (DAG). * Each layer has a name, a layer type, * a list of input names, a list of output names, * and a collection of parameters specific to the layer type. * * The graph structure and connectivity of the neural network * is inferred from the input and output names. * A neural network starts with the layer * whose input name is equal to the value specified in * Model.description.input.name, * and ends with the layer * whose output name is equal to the value specified in * Model.description.output.name. * Layers must have unique input and output names, * and a layer may not have input or output names that * refer to layers that are not yet defined. * * For Core ML specification version <=3, * all inputs are mapped to static rank 5 tensors, with axis notations * [Sequence, Batch, Channel, Height, Width]. * * From specification version 4 onwards (iOS >= 13, macOS >= 10.15), more options are available * (see enums NeuralNetworkMultiArrayShapeMapping, NeuralNetworkImageShapeMapping) * to map inputs to generic N-Dimensional (or N rank) tensors, where N >= 1. * * Each layer type may have specific constraints on the ranks of its inputs and outputs. * * Some of the layers (such as softmax, reduce, etc) have parameters that have been described in * terms of notational axis "Channel", "Height", "Width" or "Sequence". They can be re-interpreted easily in * the general ND setting by using the following rule: * "width" is same as axis = -1 (i.e. the last axis from the end) * "height" is same as axis = -2 (i.e. the second last axis from the end) * "channel" is same as axis = -3 (i.e. the third last axis from the end) * "sequence" is same as axis = -5 (i.e. 
the fifth last axis from the end)
 *
 * Several layers are available in 3 different variations, with the names ending
 * in identifiers: like, static and dynamic. For instance, FillLike,
 * FillStatic and FillDynamic. The static variation generally will have
 * a property corresponding to the shape of the output. For instance, if the
 * output of the FillStatic layer is desired to be of shape (10, 4), the
 * property targetShape will have to be set to [10, 4]. In the dynamic case,
 * the shape is an input, hence it can be changed at runtime. For instance, for
 * a FillDynamic layer, the input would have to be an array containing the
 * values 10 and 4, if the desired output is of shape (10, 4). Whereas in the
 * like case, the additional input's shape is used as the output shape, ignoring
 * its values. For instance, for a FillLike layer, for an input with shape
 * (10, 4), the output generated will also be of shape (10, 4), values of the
 * input will be ignored.
 */

syntax = "proto3";
option optimize_for = LITE_RUNTIME;

import public "DataStructures.proto";
import public "Parameters.proto";

package CoreML.Specification;

/*
 * Describes how the MultiArray shape for the inputs,
 * provided in Features Types proto via model description,
 * is mapped to construct tensors that are fed into the Neural Network layers.
 */
enum NeuralNetworkMultiArrayShapeMapping {

    /*
     * Default legacy value. Only supported for Core ML Specification version <= 3.
     *
     * The default legacy shape mapping resolves all input shapes to a rank 5
     * equivalent with axis notation of [Seq, Batch, Channel, Height, Width].
     *
     * When this enum value is selected,
     * the repeated shape field in the message "ArrayFeatureType" in feature types proto
     * must be either length 1 or length 3.
     *
     * The following rule is used to map the values in the shape field to the
     * actual tensor shape:
     *   rank 1 shape is mapped to shape [1,1,C,1,1]
     *   rank 3 shape is mapped to shape [1,1,C,H,W]
     * At runtime, the first two dimensions (Seq or Batch) can be presented as
     * well, with non-1 values.
     *
     * It is invalid to use this enum value if any of the layers added
     * Specification version 4 (iOS >= 13, macOS >= 10.15) onwards are used in
     * the network. Validator will raise an error in that case.
     */
    RANK5_ARRAY_MAPPING = 0;

    /*
     * The exact shape and rank (i.e. number of dimensions in the shape) of the
     * input, as specified in the message "ArrayFeatureType", is passed through
     * to the layers.
     * Supported only for Specification version >= 4 (iOS >= 13, macOS >= 10.15).
     */
    EXACT_ARRAY_MAPPING = 1;

}

/*
 * Describes how the shape of the input tensors is constructed from image inputs.
 */
enum NeuralNetworkImageShapeMapping {

    /*
     * In this case, image input is mapped to a rank 5 tensor.
     * For Color images, input tensor is shaped as [1,1,3,H,W].
     * For Gray images, input tensor is shaped as [1,1,1,H,W].
     */
    RANK5_IMAGE_MAPPING = 0;

    /*
     * For Color images, input tensor is shaped as [1,3,H,W].
     * For Gray images, input tensor is shaped as [1,1,H,W].
     * Supported only for Specification version >= 4 (iOS >= 13, macOS >= 10.15).
     */
    RANK4_IMAGE_MAPPING = 1;

}

/**
 * A neural network.
*/ message NeuralNetwork { repeated NeuralNetworkLayer layers = 1; repeated NeuralNetworkPreprocessing preprocessing = 2; // use this enum value to determine the input tensor shapes to the neural network, for multiarray inputs NeuralNetworkMultiArrayShapeMapping arrayInputShapeMapping = 5; // use this enum value to determine the input tensor shapes to the neural network, for image inputs NeuralNetworkImageShapeMapping imageInputShapeMapping = 6; NetworkUpdateParameters updateParams = 10; } /// Preprocessing /// ------------- /** * A neural network preprocessor that * performs a scalar multiplication of an image * followed by addition of scalar biases to the channels. * * Input: X * An image in BGR or RGB format with shape [3, H, W] * or in grayscale format with shape [1, H, W]. * Output: Y * An image with format and shape corresponding to the input. * * If the input image is in BGR format: * * .. code:: * * Y[0, :, :] = channelScale * X[0, :, :] + blueBias * Y[1, :, :] = channelScale * X[1, :, :] + greenBias * Y[2, :, :] = channelScale * X[2, :, :] + redBias * * If the input image is in RGB format: * * .. code:: * * Y[0, :, :] = channelScale * X[0, :, :] + redBias * Y[1, :, :] = channelScale * X[1, :, :] + greenBias * Y[2, :, :] = channelScale * X[2, :, :] + blueBias * * If the input image is in grayscale format: * * .. code:: * * Y[0, :, :] = channelScale * X[0, :, :] + grayBias */ message NeuralNetworkImageScaler { float channelScale = 10; ///Scalar to be multiplied. float blueBias = 20; ///Scalar blue bias to be added. float greenBias = 21; ///Scalar green bias to be added. float redBias = 22; ///Scalar red bias to be added. float grayBias = 30; ///Scalar bias to be added for grayscale images. } /** * A neural network preprocessor that * subtracts the provided mean image from the input image. * The mean image is subtracted from the input named * NeuralNetworkPreprocessing.featureName. 
*/ message NeuralNetworkMeanImage { /** * Mean image stored as a flattened array of floats, * representing shape [Channel,Height,Width]. */ repeated float meanImage = 1; } /// Preprocessing parameters for image inputs. message NeuralNetworkPreprocessing { string featureName = 1; /// must be equal to the input name to which the preprocessing is applied oneof preprocessor { NeuralNetworkImageScaler scaler = 10; NeuralNetworkMeanImage meanImage = 11; } } /// Activation Functions /// -------------------- /** * A rectified linear unit (ReLU) activation function. * * This function has the following formula: * * .. math:: * f(x) = \text{max}(0, x) */ message ActivationReLU { } /** * A leaky rectified linear unit (ReLU) activation function. * * This function has the following formula: * * .. math:: * f(x) = \begin{cases} * x & \text{if } x \geq 0 \\ * \alpha x & \text{if } x < 0 * \end{cases} */ message ActivationLeakyReLU { float alpha = 1; //negative slope value for leakyReLU } /** * A hyperbolic tangent activation function. * * This function has the following formula: * * .. math:: * f(x) = \dfrac{1 - e^{-2x}}{1 + e^{-2x}} */ message ActivationTanh { } /** * A scaled hyperbolic tangent activation function. * * This function has the following formula: * * .. math:: * f(x) = \alpha \tanh(\beta x) */ message ActivationScaledTanh { float alpha = 1; float beta = 2; } /** * A sigmoid activation function. * * This function has the following formula: * * .. math:: * f(x) = \dfrac{1}{1 + e^{-x}} */ message ActivationSigmoid { } /** * A linear activation function. * * This function has the following formula: * * .. math:: * f(x) = \alpha x + \beta */ message ActivationLinear { float alpha = 1; float beta = 2; } /** * A hard sigmoid activation function. * * This function has the following formula: * * .. 
math:: * f(x) = \text{min}(\text{max}(\alpha x + \beta, 0), 1) */ message ActivationSigmoidHard { float alpha = 1; float beta = 2; } /** * A parameterized rectified linear unit (PReLU) activation function. * Input must be at least rank 3. Axis = -3 is denoted by "C", or channels. * "alpha" parameter can be a vector of length C. * * This function has the following formula: * * .. math:: * f(x_i) = \begin{cases} * x_i & \text{if } x_i \geq 0 \\ * \alpha_i x_i & \text{if } x_i < 0 * \end{cases} \;,\;i=1,...,C */ message ActivationPReLU { // parameter of length C or 1. // If length is 1, same value is used for all channels WeightParams alpha = 1; } /** * An exponential linear unit (ELU) activation function. * * This function has the following formula: * * .. math:: * f(x) = \begin{cases} * x & \text{if } x \geq 0 \\ * \alpha (e^x - 1) & \text{if } x < 0 * \end{cases} */ message ActivationELU { float alpha = 1; } /** * A thresholded rectified linear unit (ReLU) activation function. * * This function has the following formula: * * .. math:: * f(x) = \begin{cases} * x & \text{if } x \geq \alpha \\ * 0 & \text{if } x < \alpha * \end{cases} */ message ActivationThresholdedReLU { float alpha = 1; } /** * A softsign activation function. * * This function has the following formula: * * .. math:: * f(x) = \dfrac{x}{1 + |x|} */ message ActivationSoftsign { } /** * A softplus activation function. * * This function has the following formula: * * .. math:: * f(x) = \text{log}(1 + e^x) */ message ActivationSoftplus { } /** * A parametric softplus activation function. * Input must be at least rank 3. axis = -3 is denoted by "C", or channels. * "alpha"/"beta" parameter can be a vector of length C. * * This function has the following formula: * * .. 
math::
 *     f(x_i) = \alpha_i \text{log}(1 + e^{\beta_i x_i}) \;,\;i=1,...,C
 */
message ActivationParametricSoftplus {

    // If length is 1, same value is used for all channels
    // parameter of length C or 1
    WeightParams alpha = 1;

    // parameter of length C or 1
    WeightParams beta = 2;

}

message ActivationParams {
    oneof NonlinearityType {
        ActivationLinear linear = 5;

        ActivationReLU ReLU = 10;
        ActivationLeakyReLU leakyReLU = 15;
        ActivationThresholdedReLU thresholdedReLU = 20;
        ActivationPReLU PReLU = 25;

        ActivationTanh tanh = 30;
        ActivationScaledTanh scaledTanh = 31;

        ActivationSigmoid sigmoid = 40;
        ActivationSigmoidHard sigmoidHard = 41;

        ActivationELU ELU = 50;

        ActivationSoftsign softsign = 60;
        ActivationSoftplus softplus = 70;
        ActivationParametricSoftplus parametricSoftplus = 71;
    }
}

/**
 * Representation of the intermediate tensors
 */
message Tensor {

    // Number of dimensions in the tensor shape
    uint32 rank = 1;

    // actual value of the tensor shape.
    // must be of length "rank". Can contain -1s for unknown dimensions.
    repeated int64 dimValue = 2;

}

/**
 * A single neural network layer.
 */
message NeuralNetworkLayer {

    // descriptive name of the layer
    string name = 1;

    repeated string input = 2;
    repeated string output = 3;

    // must be the same length as the "input" field
    repeated Tensor inputTensor = 4;

    // must be the same length as the "output" field
    repeated Tensor outputTensor = 5;

    // Must be set to true to mark the layer as updatable.
    // If true, the weightParams in the layer's properties must also be set to updatable
    // If false, the value of the isUpdatable parameter within the layer's weights are ignored
    bool isUpdatable = 10;

    oneof layer {

        // Start at 100 here
        ConvolutionLayerParams convolution = 100;

        PoolingLayerParams pooling = 120;

        ActivationParams activation = 130;

        InnerProductLayerParams innerProduct = 140;
        EmbeddingLayerParams embedding = 150;

        // Normalization-related Layers
        BatchnormLayerParams batchnorm = 160;
        MeanVarianceNormalizeLayerParams mvn = 165;
        L2NormalizeLayerParams l2normalize = 170;
        SoftmaxLayerParams softmax = 175;
        LRNLayerParams lrn = 180;

        CropLayerParams crop = 190;
        PaddingLayerParams padding = 200;
        UpsampleLayerParams upsample = 210;

        ResizeBilinearLayerParams resizeBilinear = 211;
        CropResizeLayerParams cropResize = 212;

        UnaryFunctionLayerParams unary = 220;

        // Element-wise Operations
        AddLayerParams add = 230;
        MultiplyLayerParams multiply = 231;

        AverageLayerParams average = 240;
        ScaleLayerParams scale = 245;

        BiasLayerParams bias = 250;
        MaxLayerParams max = 260;
        MinLayerParams min = 261;

        DotProductLayerParams dot = 270;
        ReduceLayerParams reduce = 280;
        LoadConstantLayerParams loadConstant = 290;

        // Data Reorganization
        ReshapeLayerParams reshape = 300;
        FlattenLayerParams flatten = 301;
        PermuteLayerParams permute = 310;
        ConcatLayerParams concat = 320;
        SplitLayerParams split = 330;
        SequenceRepeatLayerParams sequenceRepeat = 340;

        ReorganizeDataLayerParams reorganizeData = 345;
        SliceLayerParams slice = 350;

        // Recurrent Layers
        SimpleRecurrentLayerParams simpleRecurrent = 400;
        GRULayerParams gru = 410;
        UniDirectionalLSTMLayerParams uniDirectionalLSTM = 420;
        BiDirectionalLSTMLayerParams biDirectionalLSTM = 430;

        // Custom (user-implemented) Layer
        CustomLayerParams custom = 500;

        // Following layers are available only after Core ML Specification
        // version >= 4 (iOS >= 13, macOS >= 10.15)

        // Control Flow related Layers
        CopyLayerParams copy = 600;
        BranchLayerParams branch = 605;

        LoopLayerParams loop = 615;
        LoopBreakLayerParams loopBreak = 620;
        LoopContinueLayerParams loopContinue = 625;

        RangeStaticLayerParams rangeStatic = 635;
        RangeDynamicLayerParams rangeDynamic = 640;

        // Element-wise Unary Layers
        ClipLayerParams clip = 660;
        CeilLayerParams ceil = 665;
        FloorLayerParams floor = 670;

        SignLayerParams sign = 680;
        RoundLayerParams round = 685;

        Exp2LayerParams exp2 = 700;

        SinLayerParams sin = 710;
        CosLayerParams cos = 715;
        TanLayerParams tan = 720;

        AsinLayerParams asin = 730;
        AcosLayerParams acos = 735;
        AtanLayerParams atan = 740;

        SinhLayerParams sinh = 750;
        CoshLayerParams cosh = 755;
        TanhLayerParams tanh = 760;

        AsinhLayerParams asinh = 770;
        AcoshLayerParams acosh = 775;
        AtanhLayerParams atanh = 780;

        ErfLayerParams erf = 790;
        GeluLayerParams gelu = 795;

        // Element-wise Binary with Broadcasting Support
        EqualLayerParams equal = 815;
        NotEqualLayerParams notEqual = 820;
        LessThanLayerParams lessThan = 825;
        LessEqualLayerParams lessEqual = 827;
        GreaterThanLayerParams greaterThan = 830;
        GreaterEqualLayerParams greaterEqual = 832;

        LogicalOrLayerParams logicalOr = 840;
        LogicalXorLayerParams logicalXor = 845;
        LogicalNotLayerParams logicalNot = 850;
        LogicalAndLayerParams logicalAnd = 855;

        ModBroadcastableLayerParams modBroadcastable = 865;
        MinBroadcastableLayerParams minBroadcastable = 870;
        MaxBroadcastableLayerParams maxBroadcastable = 875;
        AddBroadcastableLayerParams addBroadcastable = 880;
        PowBroadcastableLayerParams powBroadcastable = 885;
        DivideBroadcastableLayerParams divideBroadcastable = 890;
        FloorDivBroadcastableLayerParams floorDivBroadcastable = 895;
        MultiplyBroadcastableLayerParams multiplyBroadcastable = 900;
        SubtractBroadcastableLayerParams subtractBroadcastable = 905;

        // Tensor Manipulations
        TileLayerParams tile = 920;
        StackLayerParams stack = 925;
        GatherLayerParams gather = 930;
        ScatterLayerParams scatter = 935;
        GatherNDLayerParams gatherND = 940;
        ScatterNDLayerParams scatterND = 945;
        SoftmaxNDLayerParams softmaxND = 950;
        GatherAlongAxisLayerParams gatherAlongAxis = 952;
        ScatterAlongAxisLayerParams scatterAlongAxis = 954;

        ReverseLayerParams reverse = 960;
        ReverseSeqLayerParams reverseSeq = 965;

        SplitNDLayerParams splitND = 975;
        ConcatNDLayerParams concatND = 980;
        TransposeLayerParams transpose = 985;

        SliceStaticLayerParams sliceStatic = 995;
        SliceDynamicLayerParams sliceDynamic = 1000;
        SlidingWindowsLayerParams slidingWindows = 1005;

        TopKLayerParams topK = 1015;
        ArgMinLayerParams argMin = 1020;
        ArgMaxLayerParams argMax = 1025;

        EmbeddingNDLayerParams embeddingND = 1040;
        BatchedMatMulLayerParams batchedMatmul = 1045;

        // Tensor Allocation / Reshape-related Operations
        GetShapeLayerParams getShape = 1065;
        LoadConstantNDLayerParams loadConstantND = 1070;

        FillLikeLayerParams fillLike = 1080;
        FillStaticLayerParams fillStatic = 1085;
        FillDynamicLayerParams fillDynamic = 1090;

        BroadcastToLikeLayerParams broadcastToLike = 1100;
        BroadcastToStaticLayerParams broadcastToStatic = 1105;
        BroadcastToDynamicLayerParams broadcastToDynamic = 1110;

        SqueezeLayerParams squeeze = 1120;
        ExpandDimsLayerParams expandDims = 1125;
        FlattenTo2DLayerParams flattenTo2D = 1130;

        ReshapeLikeLayerParams reshapeLike = 1135;
        ReshapeStaticLayerParams reshapeStatic = 1140;
        ReshapeDynamicLayerParams reshapeDynamic = 1145;
        RankPreservingReshapeLayerParams rankPreservingReshape = 1150;

        ConstantPaddingLayerParams constantPad = 1155;

        // Random Distributions
        RandomNormalLikeLayerParams randomNormalLike = 1170;
        RandomNormalStaticLayerParams randomNormalStatic = 1175;
        RandomNormalDynamicLayerParams randomNormalDynamic = 1180;

        RandomUniformLikeLayerParams randomUniformLike = 1190;
        RandomUniformStaticLayerParams randomUniformStatic = 1195;
        RandomUniformDynamicLayerParams randomUniformDynamic = 1200;

        RandomBernoulliLikeLayerParams randomBernoulliLike = 1210;
        RandomBernoulliStaticLayerParams randomBernoulliStatic = 1215;
        RandomBernoulliDynamicLayerParams randomBernoulliDynamic = 1220;

        CategoricalDistributionLayerParams categoricalDistribution = 1230;

        // Reduction-related Layers:
        ReduceL1LayerParams reduceL1 = 1250;
        ReduceL2LayerParams reduceL2 = 1255;
        ReduceMaxLayerParams reduceMax = 1260;
        ReduceMinLayerParams reduceMin = 1265;
        ReduceSumLayerParams reduceSum = 1270;
        ReduceProdLayerParams reduceProd = 1275;
        ReduceMeanLayerParams reduceMean = 1280;
        ReduceLogSumLayerParams reduceLogSum = 1285;
        ReduceSumSquareLayerParams reduceSumSquare = 1290;
        ReduceLogSumExpLayerParams reduceLogSumExp = 1295;

        // Masking / Selection Layers
        WhereNonZeroLayerParams whereNonZero = 1313;
        MatrixBandPartLayerParams matrixBandPart = 1315;
        LowerTriangularLayerParams lowerTriangular = 1320;
        UpperTriangularLayerParams upperTriangular = 1325;
        WhereBroadcastableLayerParams whereBroadcastable = 1330;

        // Normalization Layers
        LayerNormalizationLayerParams layerNormalization = 1350;

        NonMaximumSuppressionLayerParams NonMaximumSuppression = 1400;

        // Following layers are available only after Core ML Specification
        // version >= 5 (iOS >= 14, macOS >= 11.0)
        OneHotLayerParams oneHot = 1450;
        CumSumLayerParams cumSum = 1455;
        ClampedReLULayerParams clampedReLU = 1460;
        ArgSortLayerParams argSort = 1461;
        Pooling3DLayerParams pooling3d = 1465;
        GlobalPooling3DLayerParams globalPooling3d = 1466;
        SliceBySizeLayerParams sliceBySize = 1470;
        Convolution3DLayerParams convolution3d = 1471;

    }

}

/**
 * Branching Layer
 *
 * A layer that provides the functionality of branching or an If-Else block.
 *
 * Must have 1 input. There are no outputs as the execution is transferred to
 * either the if or the else branch based on the value of the input.
 *
 * Input is the condition predicate. Must be a scalar (length 1 tensor).
 *
 */
message BranchLayerParams {

    /**
     * execute this graph if the absolute value of the input Tensor is greater than 1e-6
     * This must be present.
     */
    NeuralNetwork ifBranch = 1;

    /**
     * execute this graph if the absolute value of the input Tensor is less than 1e-6
     * This is optional.
*/ NeuralNetwork elseBranch = 2; } /** * Loop Layer * * A layer that provides the functionality of a "for" loop or a "while" loop. * * There are either no inputs or 1 input. When an input is present, it corresponds to the maximum loop count, * in that case the value of the "maxLoopIterations" field is ignored. Input must be a scalar. * (For description below, maxLoopIterations is assumed to be the value of the input, when its present) * * No outputs are produced. Blobs produced by the condition or the body network are visible in the scope of the overall network. * * "conditionNetwork" must produce a tensor with the name specified in the "conditionVar" field. * * There are 3 possible cases for determining the termination condition: * * Case 1: * * If there is no "conditionNetwork", in this case the layer corresponds to a pure for loop, which is run "maxLoopIterations" number of times. * Equivalent pseudo-code: * * for loopIterator = 0 : maxLoopIterations * bodyNetwork() * * * Case 2: * * "conditionNetwork" is present, and "maxLoopIterations" is 0 and there is no input, * in this case the layer corresponds to a while loop. Equivalent pseudo-code: * * conditionVar = conditionNetwork() * while conditionVar: * bodyNetwork() * conditionVar = conditionNetwork() * * * Case 3: * * "conditionNetwork" is provided, and "maxLoopIterations" is positive or there is an input, * in this case the layer corresponds to a while loop with a joint condition. Equivalent pseudo-code: * * loopIterator = 0 * conditionVar = conditionNetwork() * while (conditionVar and loopIterator < maxLoopIterations): * bodyNetwork() * loopIterator = loopIterator + 1 * conditionVar = conditionNetwork() * */ message LoopLayerParams { /** * maximum number of iterations. Ignored if input is present. */ uint64 maxLoopIterations = 1; /** * This field provides the name of the tensor which is produced by the conditionNetwork * and whose value is checked to start/continue/terminate the loop. 
Value close to 0.0f is treated as False. * This field is optional. * Must be a non empty string if and only if "conditionNetwork" is present. */ string conditionVar = 2; /** * Must generate a tensor with the name provided in the "conditionVar" field. * This field is optional. * Must be present if and only if "conditionVar" field is a non empty string. */ NeuralNetwork conditionNetwork = 3; /** * Body of the loop. * This field must be present. */ NeuralNetwork bodyNetwork = 4; } /** * Loop break Layer * * Terminate the loop that has this layer. * If present, it should always reside in the "bodyNetwork" of the loop layer * * No inputs/outputs * */ message LoopBreakLayerParams { } /** * Loop Continue Layer * * Stop the current loop iteration and continue on the next iteration. * If present, it should always reside in the "bodyNetwork" of the loop layer * * No inputs/outputs * */ message LoopContinueLayerParams { } /** * Copy Layer * * A layer that copies its input tensor to the output tensor. * Must have 1 input and 1 output, with distinct names. * This is the only layer that is allowed to re-generate an output that is already present in the neural network prior to this layer, * in which case it will overwrite the output tensor. * */ message CopyLayerParams { } /** * GreaterThan Layer * * Either 1 or 2 inputs. * Produces 1 output. * Perform elementwise greater than operation. * * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = x1 > x2 * or * y = x1 > alpha, if only one input is provided * * Broadcasting is supported. * */ message GreaterThanLayerParams { /** * Compare to the scalar value provided here if there is 1 input */ float alpha = 2; } /** * GreaterEqual Layer * * Either 1 or 2 inputs. * Produces 1 output. * Perform elementwise greater equal operation. * * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = x1 >= x2 * or * y = x1 >= alpha, if only one input is provided * * Broadcasting is supported. 
* */ message GreaterEqualLayerParams { /** * Compare to the scalar value provided here if there is 1 input */ float alpha = 2; } /** * LessThan Layer * * Either 1 or 2 inputs. * Produces 1 output. * Perform elementwise less than operation. * * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = x1 < x2 * or * y = x1 < alpha, if only one input is provided * * Broadcasting is supported. * */ message LessThanLayerParams { /** * Compare to the scalar value provided here if there is 1 input */ float alpha = 2; } /** * LessEqual Layer * * Either 1 or 2 inputs. * Produces 1 output. * Perform elementwise less equal operation. * * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = x1 <= x2 * or * y = x1 <= alpha, if only one input is provided * * Broadcasting is supported. * */ message LessEqualLayerParams { /** * Compare to the scalar value provided here if there is 1 input */ float alpha = 2; } /** * Equal Layer * * Either 1 or 2 inputs. * Produces 1 output. * Perform elementwise equal operation. * * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = x1 == x2 * or * y = x1 == alpha, if only one input is provided * * Broadcasting is supported. * */ message EqualLayerParams { /** * Compare to the scalar value provided here if there is 1 input */ float alpha = 1; } /** * NotEqual Layer * * Either 1 or 2 inputs. * Produces 1 output. * Perform elementwise not equal operation. * * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = x1 != x2 * or * y = x1 != alpha, if only one input is provided * * Broadcasting is supported. * */ message NotEqualLayerParams { /** * Compare to the scalar value provided here if there is 1 input */ float alpha = 1; } /** * LogicalAnd Layer * * Must have 2 inputs, produces 1 output. * Perform elementwise logical AND operation. * * Input is considered False if equal to 0.0f otherwise True. * Output is 1.0f if the condition is true otherwise 0.0f. 
* * .. code:: * * y = AND(x1, x2) * * Broadcasting is supported. * */ message LogicalAndLayerParams { } /** * LogicalOr Layer * * Must have 2 inputs, produces 1 output. * Perform elementwise logical OR operation. * * Input is considered False if equal to 0.0f otherwise True. * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = OR(x1, x2) * * Broadcasting is supported. * */ message LogicalOrLayerParams { } /** * LogicalXor Layer * * Must have 2 inputs, produces 1 output. * Perform elementwise logical XOR operation. * * Input is considered False if equal to 0.0f otherwise True. * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = XOR(x1, x2) * * Broadcasting is supported. * */ message LogicalXorLayerParams { } /** * LogicalNot Layer * * Must have 1 input, produces 1 output. * Perform elementwise logical NOT operation. * * Input is considered False if equal to 0.0f otherwise True. * Output is 1.0f if the condition is true otherwise 0.0f. * * .. code:: * * y = NOT(x) * * */ message LogicalNotLayerParams { } /// Border Amounts /// -------------- /** * Specifies the amount of spatial border to be either padded or cropped. * * For padding: * * .. code:: * * H_out = borderAmounts[0].startEdgeSize + H_in + borderAmounts[0].endEdgeSize * W_out = borderAmounts[1].startEdgeSize + W_in + borderAmounts[1].endEdgeSize * * topPaddingAmount == Height startEdgeSize * bottomPaddingAmount == Height endEdgeSize * leftPaddingAmount == Width startEdgeSize * rightPaddingAmount == Width endEdgeSize * * For cropping: * * .. 
code::
 *
 *      H_out = (-borderAmounts[0].startEdgeSize) + H_in + (-borderAmounts[0].endEdgeSize)
 *      W_out = (-borderAmounts[1].startEdgeSize) + W_in + (-borderAmounts[1].endEdgeSize)
 *
 *      topCropAmount == Height startEdgeSize
 *      bottomCropAmount == Height endEdgeSize
 *      leftCropAmount == Width startEdgeSize
 *      rightCropAmount == Width endEdgeSize
 */
message BorderAmounts {

    message EdgeSizes {

        /**
         * The amount to be padded or cropped from the beginning.
         */
        uint64 startEdgeSize = 1;

        /**
         * The amount to be padded or cropped from the end.
         */
        uint64 endEdgeSize = 2;

    }

    /**
     * The border amounts.
     * This must be length 2 in the order [H, W].
     */
    repeated EdgeSizes borderAmounts = 10;

}

/**
 * Specifies the type of padding to be used with Convolution/Deconvolution and
 * Pooling layers. After padding, input spatial shape: [H_in, W_in], gets
 * modified to the output spatial shape [H_out, W_out].
 *
 * .. code::
 *
 *      topPaddingAmount == Height startEdgeSize == borderAmounts[0].startEdgeSize
 *      bottomPaddingAmount == Height endEdgeSize == borderAmounts[0].endEdgeSize
 *      leftPaddingAmount == Width startEdgeSize == borderAmounts[1].startEdgeSize
 *      rightPaddingAmount == Width endEdgeSize == borderAmounts[1].endEdgeSize
 *
 * With Convolution or Pooling:
 *
 * .. code::
 *
 *      H_out = int_division_round_down((H_in + topPaddingAmount + bottomPaddingAmount - KernelSize[0]),stride[0]) + 1
 *
 * which is same as:
 *
 * .. code::
 *
 *      H_out = int_division_round_up((H_in + topPaddingAmount + bottomPaddingAmount - KernelSize[0] + 1),stride[0])
 *
 * With Deconvolution:
 *
 * .. code::
 *
 *      H_out = (H_in-1) * stride[0] + kernelSize[0] - (topPaddingAmount + bottomPaddingAmount)
 *
 *
 * The equivalent expressions hold true for W_out as well.
 *
 *
 * By default, the values of paddingAmounts are set to 0, which results in a
 * "true" valid padding. If non-zero values are provided for paddingAmounts,
 * "valid" convolution/pooling is performed within the spatially expanded input.
 *
 */
message ValidPadding {

    BorderAmounts paddingAmounts = 1;

}

/**
 * Specifies the type of padding to be used with Convolution/Deconvolution and
 * pooling layers. After padding, input spatial shape: [H_in, W_in], gets
 * modified to the output spatial shape [H_out, W_out].
 * With Convolution or pooling:
 *
 * .. code::
 *
 *      H_out = int_division_round_up(H_in,stride[0])
 *      W_out = int_division_round_up(W_in,stride[1])
 *
 * This is achieved by using the following padding amounts:
 *
 * .. code::
 *
 *     totalPaddingHeight = max(0,(H_out-1) * stride[0] + KernelSize[0] - Hin)
 *     totalPaddingWidth = max(0,(W_out-1) * stride[1] + KernelSize[1] - Win)
 *
 * There are two modes of asymmetry:
 * BOTTOM_RIGHT_HEAVY, and TOP_LEFT_HEAVY.
 *
 * If the mode is BOTTOM_RIGHT_HEAVY:
 *
 * .. code::
 *
 *     topPaddingAmount = floor(totalPaddingHeight / 2)
 *     bottomPaddingAmount = totalPaddingHeight - topPaddingAmount
 *     leftPaddingAmount = floor(totalPaddingWidth / 2)
 *     rightPaddingAmount = totalPaddingWidth - leftPaddingAmount
 *
 * If the mode is TOP_LEFT_HEAVY:
 *
 * .. code::
 *
 *     bottomPaddingAmount = floor(totalPaddingHeight / 2)
 *     topPaddingAmount = totalPaddingHeight - bottomPaddingAmount
 *     rightPaddingAmount = floor(totalPaddingWidth / 2)
 *     leftPaddingAmount = totalPaddingWidth - rightPaddingAmount
 *
 *
 * With Deconvolution:
 *
 * .. code::
 *
 *      H_out = H_in * stride[0]
 *      W_out = W_in * stride[1]
 */
message SamePadding {

    enum SamePaddingMode {
        BOTTOM_RIGHT_HEAVY = 0;
        TOP_LEFT_HEAVY = 1;
    }

    SamePaddingMode asymmetryMode = 1;

}

/**
 * Specifies how grid points are sampled from an interval.
 * Without the loss of generality, assume the interval to be [0, X-1] from
 * which N points are to be sampled. Here X may correspond to an input image's
 * height or width.
 * All the methods can be expressed in terms of numpy's linspace function,
 * along with the constraint that grid points have to lie in the interval
 * [0, X-1].
 * Note: numpy.linspace(start = start, end = end, num = N, endpoint = True)
 * corresponds to sampling N points uniformly from the interval [start, end],
 * endpoints included.
 * The methods vary in how the start and end values are computed.
 */
message SamplingMode {

    enum Method {

        /**
         * start = 0, end = X-1
         * grid points = numpy.linspace(start, end)
         */
        STRICT_ALIGN_ENDPOINTS_MODE = 0;

        /**
         * if N == 1: start = end = (X-1)/2
         * otherwise, start = 0, end = X-1
         * grid points = numpy.linspace(start, end)
         */
        ALIGN_ENDPOINTS_MODE = 1;

        /**
         * start = 0, end = X - X/N
         * grid points = min(X-1, numpy.linspace(start, end))
         * This is same as the mode used in the upsample layer in this
         * specification, when used with bilinear interpolation. In that case
         * N/X = upsample ratio.
         */
        UPSAMPLE_MODE = 2;

        /**
         * spacing = max(1, X-1)/N
         * start = 0.5 * spacing
         * end = start + (N-1) * spacing
         * grid points = min(X-1, numpy.linspace(start, end))
         */
        ROI_ALIGN_MODE = 3;

    }

    Method samplingMethod = 1;

}

/**
 * Specifies the convention used to specify four bounding box coordinates for
 * an image of size (Height, Width). The (0,0) coordinate corresponds to the
 * top-left corner of the image.
 */
message BoxCoordinatesMode {

    enum Coordinates {

        /**
         * [h_start, w_start, h_end, w_end]
         */
        CORNERS_HEIGHT_FIRST = 0;

        /**
         * [w_start, h_start, w_end, h_end]
         */
        CORNERS_WIDTH_FIRST = 1;

        /**
         * [h_center, w_center, box_height, box_width]
         */
        CENTER_SIZE_HEIGHT_FIRST = 2;

        /**
         * [w_center, h_center, box_width, box_height]
         */
        CENTER_SIZE_WIDTH_FIRST = 3;

    }

    Coordinates boxMode = 1;

}

/**
 * Weights for layer parameters.
 * Weights are stored as repeated floating point numbers
 * using row-major ordering
 * and can represent 1-, 2-, 3-, or 4-dimensional data.
 */
message WeightParams {

    /**
     * Values specified in single / float / FP32 precision.
     */
    repeated float floatValue = 1;

    /**
     * Values in 16-bit half precision floating point.
     */
    bytes float16Value = 2;

    /**
     * Raw value specification for quantized lower precisions.
     *
     * This field is interpreted as uintN, where N is the number of bits in
     * quantization.
     * E.g. if n=8, the field is interpreted as an array of UINT8.
     * Use this field for quantized parameters unless specifically noted to use
     * int8RawValue.
     */
    bytes rawValue = 30;

    /**
     * Field to be used if int8DynamicQuantize is set in the parent layer.
     * Cannot be set if rawValue is also set.
     * The values in this field are interpreted as INT8.
     *
     * If this field is set, following conditions must hold true:
     * * QuantizationType == LinearQuantizationParams, such that
     *   * size of the "scale" field is 1 and "bias" field is empty in
     *     "LinearQuantizationParams"
     */
    bytes int8RawValue = 31;

    /**
     * Quantization related parameters.
     */
    QuantizationParams quantization = 40;

    bool isUpdatable = 50;

}

/**
 * Quantization parameters.
 */
message QuantizationParams {

    uint64 numberOfBits = 1;

    oneof QuantizationType {
        LinearQuantizationParams linearQuantization = 101;
        LookUpTableQuantizationParams lookupTableQuantization = 102;
    }

}

message LinearQuantizationParams {

    /**
     * Stores scale and bias values corresponding to the quantized weights.
     * Must be an array of 1 element, or an array of C elements, where C
     * is number of output channels. For recurrent layers it is equal to
     * the output vector size.
     *
     * Relationship between quantized weights, unquantized weights, scale and
     * bias:
     *
     * W_unquantized = W_quantized * scale + bias
     *
     */
    repeated float scale = 1;
    repeated float bias = 2;

}

message LookUpTableQuantizationParams {

    /* Stores look-up table quantization values. Must be an array of
       (2^numberOfBits) Elements.
     */
    repeated float floatValue = 1;

}

/// Layers
/// ------

/**
 * A layer that performs spatial convolution or deconvolution.
 *
 * .. code::
 *
 *      y = ConvolutionLayer(x)
 *
 * Requires 1 or 2 inputs and produces 1 output.
 *
 * Input
 *    First Input:
 *      A blob with rank greater than or equal to 4.
 *      Rank 4 blob represents [Batch, channels, height, width].
* For ranks greater than 4, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. * * From Core ML specification version 4 onwards (iOS >= 13, macOS >= 10.15). * convolution layer can have 2 inputs, in which case the second input is * the blob representing the weights. This is allowed when "isDeconvolution" = False. * The weight blob should have shape * [outputChannels, kernelChannels, kernelHeight, kernelWidth], * where kernelChannels == inputChannels / nGroups. * * Output * Rank is same as the input. e.g.: for rank 4 input, output shape is [B, C_out, H_out, W_out] * * * If dilationFactor is not 1, effective kernel size is * modified as follows: * * .. code:: * * KernelSize[0] <-- (kernelSize[0]-1) * dilationFactor[0] + 1 * KernelSize[1] <-- (kernelSize[1]-1) * dilationFactor[1] + 1 * * Type of padding can be valid or same. Output spatial dimensions depend on the * the type of padding. For details, refer to the descriptions of the messages "ValidPadding" * and "SamePadding". Padded values are all zeros. * * For Deconvolution, ConvolutionPaddingType (valid or same) is ignored when outputShape is set. * * */ message ConvolutionLayerParams { /** * The number of kernels. * Same as C_out used in the layer description. */ uint64 outputChannels = 1; /** * Channel dimension of the kernels. * Must be equal to inputChannels / nGroups, if isDeconvolution == False * Must be equal to inputChannels, if isDeconvolution == True */ uint64 kernelChannels = 2; /** * Group convolution, i.e. weight reuse along channel axis. * Input and kernels are divided into g groups * and convolution / deconvolution is applied within the groups independently. * If not set or 0, it is set to the default value 1. */ uint64 nGroups = 10; /** * Must be length 2 in the order [H, W]. * If not set, default value [3, 3] is used. */ repeated uint64 kernelSize = 20; /** * Must be length 2 in the order [H, W]. * If not set, default value [1, 1] is used. 
*/ repeated uint64 stride = 30; /** * Must be length 2 in order [H, W]. * If not set, default value [1, 1] is used. * It is ignored if isDeconvolution == true. */ repeated uint64 dilationFactor = 40; /** * The type of padding. */ oneof ConvolutionPaddingType { ValidPadding valid = 50; SamePadding same = 51; } /** * Flag to specify whether it is a deconvolution layer. */ bool isDeconvolution = 60; /** * Flag to specify whether a bias is to be added or not. */ bool hasBias = 70; /** * Weights associated with this layer. * If convolution (isDeconvolution == false), weights have the shape * [outputChannels, kernelChannels, kernelHeight, kernelWidth], where kernelChannels == inputChannels / nGroups * If deconvolution (isDeconvolution == true) weights have the shape * [kernelChannels, outputChannels / nGroups, kernelHeight, kernelWidth], where kernelChannels == inputChannels */ WeightParams weights = 90; WeightParams bias = 91; /// Must be of size [outputChannels]. /** * The output shape, which has length 2 [H_out, W_out]. * This is used only for deconvolution (isDeconvolution == true). * If not set, the deconvolution output shape is calculated * based on ConvolutionPaddingType. */ repeated uint64 outputShape = 100; } /** * A layer that performs a 3-dimensional convolution. * * .. code:: * * y = Convolution3DLayer(x) * * Input * A blob of rank 5. * The input blob's shape should be [batch, channels, depth, height, width]. * * Fields * The bias field, if set, should have shape of [channelsOut]. * * Output * A blob of rank 5. * The output blob's shape is [batch, channelsOut, depthOut, heightOut, widthOut]. * * Type of padding can be custom, valid, or same. Padded values are all zeros. * Output spatial dimensions depend on the the type of padding. For details, refer to the * descriptions of the PaddingType field of this Convolution3DLayerParams message. 
* * Example * For example, given an input of size [1, 3, 3, 8, 8], a stride of 2 in each dimension, * a kernel of 3 in each dimension, 2 output channels, and same padding, this layer will * compute the total padding applied in the depth, height, and width dimensions to be 2, 1, and 1, * respectively. The depth padding is even and will be applied equally to both sides of the depth * dimension. Since the height and width padding values are odd, they'll be applied to the * bottom/right of the height/width dimensions. Thus, the padding applied to the input will be * [1, 1, 0, 1, 0, 1] (front, back, top, bottom, left, right). Finally, the output produced * will have size [1, 2, 2, 4, 4]. * */ message Convolution3DLayerParams { /** * The number of channels in the output (channelsOut). Must be a positive integer. */ int32 outputChannels = 1; /** * The number of channels in the input (channels). Must be a positive integer. */ int32 inputChannels = 2; /** * Group convolution, i.e., weight reuse along the channel axis. * It must evenly divide both the number of input and output channels and be at most the number * of input channels (a depthwise convolution). * Input and kernels are divided into g groups and convolution is applied within the groups * independently. */ int32 nGroups = 10; /* Depth of the convolution kernel. Must be a positive integer. */ int32 kernelDepth = 20; /* Height of the convolution kernel. Must be a positive integer. */ int32 kernelHeight = 21; /* Width of the convolution kernel. Must be a positive integer. */ int32 kernelWidth = 22; /* Stride along the depth direction. Must be a positive integer. */ int32 strideDepth = 31; /* Stride along the height direction. Must be a positive integer. */ int32 strideHeight = 32; /* Stride along the width direction. Must be a positive integer. */ int32 strideWidth = 33; /* Dilation along the depth direction. Must be a positive integer. */ int32 dilationDepth = 40; /* Dilation along the height direction. 
Must be a positive integer. */ int32 dilationHeight = 41; /* Dilation along the width direction. Must be a positive integer. */ int32 dilationWidth = 42; /** * Flag to specify whether a bias is to be added or not. * If false, then no bias is added. */ bool hasBias = 50; /** * Weights associated with this layer. * Weights have the shape * if deconvolution == False * [outputChannels, kernelChannels, kernelDepth, kernelHeight, kernelWidth], where * kernelChannels == inputChannels / nGroups * else if deconvolution == True * [outputChannels / nGroups, kernelChannels, kernelDepth, kernelHeight, kernelWidth], where */ WeightParams weights = 60; /** * Must be of size [outputChannels]. */ WeightParams bias = 61; /** * The type of padding. * All padding types pad the input shape with zeros. * CUSTOM padding will add the custom padding values specified below to their respective * dimensions, e.g., customPaddingFront number of zeros will be added to one side of the * input's depth dimension and customPaddingBack number of zeros will be added to the other * side of the input's depth dimension. * VALID padding adds no padding to any dimension. In this case, the last convolution along * each dimension will be dropped if the input dimension and the kernel size, stride, and * dilation do not match. * SAME padding adds enough padding to each dimension such that the output of the convolution * has size Ceiling(inputShape / stride). Padding is added evenly to both sides of each * dimension unless the total padding to add is odd, in which case it is added to the * back/bottom/right side of the respective dimension. For example, if the total padding needed * in the depth dimension is 3, 1 zero will be added to the front side of the depth dimension * and 2 zeros will be added to the back side. */ enum PaddingType { CUSTOM = 0; VALID = 1; SAME = 2; } PaddingType paddingType = 70; /* Padding before the input in the depth direction. Must be zero or a positive integer. 
* Used when the PaddingType is CustomPadding, otherwise ignored by other padding types. */ int32 customPaddingFront = 80; /* Padding after the input in the depth direction. Must be zero or a positive integer. * Used when the PaddingType is CustomPadding, otherwise ignored by other padding types. */ int32 customPaddingBack = 81; /* Padding before the input in the height direction. Must be zero or a positive integer. * Used when the PaddingType is CustomPadding, otherwise ignored by other padding types. */ int32 customPaddingTop = 82; /* Padding after the input in the height direction. Must be zero or a positive integer. * Used when the PaddingType is CustomPadding, otherwise ignored by other padding types. */ int32 customPaddingBottom = 83; /* Padding before the input in the width direction. Must be zero or a positive integer. * Used when the PaddingType is CustomPadding, otherwise ignored by other padding types. */ int32 customPaddingLeft = 84; /* Padding after the input in the width direction. Must be zero or a positive integer. * Used when the PaddingType is CustomPadding, otherwise ignored by other padding types. */ int32 customPaddingRight = 85; /* Flag to specify if this is Convolution Transpose or not. */ bool isDeconvolution = 86; /* * The output shape, which has length 3 [D_out, H_out, W_out]. * This is used only for deconvolution (isDeconvolution == true). * If not set, the deconvolution output shape is calculated * based on PaddingType. */ repeated uint64 outputShape = 87; } /** * A layer that performs a matrix-vector or matrix-matrix product. * This is equivalent to a fully-connected, or dense layer. * The weight parameters correspond to a matrix of dimensions (inputChannels, outputChannels) i.e. (C_in, C_out) * * .. code:: * * y = InnerProductLayer(x) * * Requires 1 input and produces 1 output. * * Input * Input can have rank 1 to rank 5. 
This is how it is reshaped in to the matrix (for rank > 1): * rank 1 (x1) : in this case, the layer corresponds to a matrix-vector product. x1 must be equal to C_in * rank 2 (x1, x2): x2 must be equal to C_in * rank 3 (x1, x2, x3) --> (x1 * x2, x3). x3 must be equal to C_in * rank 4 (x1, x2, x3, x4) ---> (x1, x2 * x3 * x4). x2 * x3 * x4 must be equal to C_in * rank 5 (x1, x2, x3, x4, x5) ---> (x1 * x2, x3 * x4 * x5). x3 * x4 * x5 must be equal to C_in * * Output * Output rank is same as the input rank * rank 1: (C_out) * rank 2: (x1, C_out) * rank 3: (x1, x2, C_out) * rank 4: (x1, C_out, 1, 1) * rank 5: (x1, x2, C_out, 1, 1) * */ message InnerProductLayerParams { uint64 inputChannels = 1; /// Input size: C_in. uint64 outputChannels = 2; /// Output size: C_out. bool hasBias = 10; /// Whether a bias is added or not. WeightParams weights = 20; /// Weight matrix [C_out, C_in]. WeightParams bias = 21; /// Bias vector [C_out]. /** * If set, this layer, at runtime, quantizes the floating point input blob to int8 before applying an * inner product using INT8 weight matrix parameters, as provided in weights->int8RawValue. The * result is then dequantized. * Requires: * * hasBias == false * * QuantizationType == LinearQuantizationParams, such that * * size of the "scale" field is 1 and "bias" field is empty in "LinearQuantizationParams" * * numberOfBits == 8 * * weights->rawValue_size to be empty */ bool int8DynamicQuantize = 22; } /** * A layer that performs a matrix lookup and optionally adds a bias. * The weights matrix is stored with dimensions [outputChannels, inputDim]. * * .. code:: * * y = EmbeddingLayer(x) * * Requires 1 input and produces 1 output. * * Input * Input values must be in the range [0, inputDim - 1]. * * Input must have rank equal to 4 or 5, such that the last 3 dimensions are all 1. * rank 4: shape (x1, 1, 1, 1). x1 is effectively the batch/sequence length. * rank 5: shape (x1, x2 , 1, 1, 1). x1 * x2 is effectively the combined batch/sequence length. 
* * Output * Output rank is same as the input rank. Please see input description above. * rank 4: shape (x1, outputChannels, 1, 1) * rank 5: shape (x1, x2, outputChannels, 1, 1) * */ message EmbeddingLayerParams { uint64 inputDim = 1; /// Size of the input dictionary. uint64 outputChannels = 2; /// Size of the output vectors. bool hasBias = 10; /// Whether a bias is added or not. WeightParams weights = 20; /// 2-D weights of dimensions [outputChannels, inputDim]. WeightParams bias = 21; /// Bias of size [outputChannels]. } /** * A layer that performs a matrix lookup and optionally adds a bias. * The weights matrix is stored with dimensions [embeddingSize, vocabSize]. * * .. code:: * * y = EmbeddingNDLayer(x) * * Requires 1 input and produces 1 output. * * Input * Input values must be in the range [0, vocabSize - 1]. * Input must have rank at least 2. The last dimension must always be 1. * rank 2: shape (x1, 1). x1 is the batch/sequence length. * rank 3: shape (x1, x2, 1). x1 * x2 is effectively the combined batch/sequence length. * rank 4: shape (x1, x2, x3, 1). x1 * x2 * x2 is effectively the combined batch/sequence length. * rank 5: shape (x1, x2 , x3, x4, 1). x1 * x2 * x3 * x4 is effectively the combined batch/sequence length. * * Output * Output rank is same as the input rank. Please see input description above. * rank 2: shape (x1, embeddingSize) * rank 3: shape (x1, x2, embeddingSize) * rank 4: shape (x1, x2, x3, embeddingSize) * rank 5: shape (x1, x2, x3, x4, embeddingSize) * */ message EmbeddingNDLayerParams { uint64 vocabSize = 1; /// Size of the input dictionary. uint64 embeddingSize = 2; /// Size of the output vectors. bool hasBias = 3; /// Whether a bias is added or not. WeightParams weights = 20; /// 2-D weights of dimensions [embeddingSize, vocabSize]. WeightParams bias = 21; /// Bias of size [embeddingSize]. } /** * A layer that performs batch normalization, * which is performed along axis = -3, * and repeated along the other axes, if present. * * .. 
code::
 *
 *      y = BatchnormLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * This operation is described by the following formula:
 *
 * .. math::
 *     y_i = \gamma_i \dfrac{ (x_i - \mu_i)}{\sqrt{\sigma_i^2 + \epsilon}} + \beta_i \;,\;i=1,....,C
 *
 * Input
 *     A blob with rank greater than equal to 3.
 *     Example: Rank 4 blob represents [Batch, channels, height, width]
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 *
 * Output
 *     A blob with the same shape as the input.
 */
message BatchnormLayerParams {
    uint64 channels = 1; /// Size of the channel dimension in the input.

    /**
     * If computeMeanVar == true,
     * the mean and variance are calculated from either
     * the single input instance, if instanceNormalization == true,
     * or the whole batch, if instanceNormalization = false,
     * and the values provided in parameters "mean" and "variance" are ignored.
     */
    bool computeMeanVar = 5;
    bool instanceNormalization = 6;

    /**
     * A small constant to avoid division by 0 while normalizing by variance.
     * Defaults to 1e-5 if not set or set to 0.
     */
    float epsilon = 10;

    WeightParams gamma = 15; /// Parameter of length [channels]
    WeightParams beta = 16; /// Parameter of length [channels]
    WeightParams mean = 17; /// Parameter of length [channels]
    WeightParams variance = 18; /// Parameter of length [channels]
}

/**
 * A spatial pooling layer.
 *
 * .. code::
 *
 *      y = PoolingLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank greater than equal to 4.
 *     Rank 4 blob represents [Batch, channels, height, width]
 *     For ranks greater than 4, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 *
 * Output
 *     Rank is same as the input.
e.g.: for rank 4 input, output shape is [B, C, H_out, W_out] * * Padding options are similar to ConvolutionLayerParams * with the additional option of ValidCompletePadding (includeLastPixel), * which ensures that the last application of the kernel * always includes the last pixel of the input image, if there is padding. * * .. code:: * * H_out = ceil(float(H_in + 2 * paddingAmounts[0] - kernelSize[0])/float(Stride[0])) + 1 * if (paddingAmounts[0] > 0 or paddingAmounts[1] > 0) * if ((H_out - 1) * Stride >= H_in + paddingAmounts[0]) { * H_out = H_out - 1 * } * } * * The equivalent expressions hold true for W_out as well. * Only symmetric padding is supported with this option. */ message PoolingLayerParams { enum PoolingType { MAX = 0; AVERAGE = 1; L2 = 2; } PoolingType type = 1; /// Type of pooling operation. /** * Must be length 2 in the order [H, W]. * If not set, default value [3, 3] is used. */ repeated uint64 kernelSize = 10; /** * Must be length 2 in the order [H, W]. * If not set, default value [1, 1] is used. */ repeated uint64 stride = 20; message ValidCompletePadding { /** * Must be length 2 in order [H, W]. * If not set, value [0, 0] is used. */ repeated uint64 paddingAmounts = 10; } oneof PoolingPaddingType { ValidPadding valid = 30; SamePadding same = 31; ValidCompletePadding includeLastPixel = 32; } /** * If true, padded values are excluded from the count (denominator) * when computing average pooling. */ bool avgPoolExcludePadding = 50; /** * If true, global pooling is performed. * Kernel size is inferred from the input data spatial dimensions. */ bool globalPooling = 60; } /* * A layer to pool three spatial dimensions * * Input * A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. * * Output * Rank is same as the input: A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. * * Requires 1 input and produces 1 output. 
* * For example, given an input of shape (1,1,2,3,3): * +----+----+----+ * / | 10 | 11 | 12 | * / +----+----+----+ * / | 13 | 14 | 15 | * / +----+----+----+ * / | 16 | 17 | 18 | * / +----+----+----+ * +----+----+----+ / * | 1 | 2 | 3 | / * +----+----+----+ / * | 4 | 5 | 6 | / * +----+----+----+ / * | 7 | 8 | 9 | / * +----+----+----+ * * And applying MAX pooling using: * Kernel: 2x2x2 * Stride: 1x1x1 * Valid Padding * We expect to get an output with shape: (1,1,1,2,2) and value: * +----+----+ * | 14 | 15 | * +----+----+ * | 17 | 18 | * +----+----+ */ message Pooling3DLayerParams { enum PoolingType3D { MAX = 0; AVERAGE = 1; } // Whether to use Max or Average PoolingType3D type = 1; // Depth of the pooling region. int32 kernelDepth = 2; // Height of the pooling region. int32 kernelHeight = 3; // Width of the pooling region. int32 kernelWidth = 4; // Stride along the depth direction int32 strideDepth = 5; // Stride along the height direction int32 strideHeight = 6; // Stride along the width direction int32 strideWidth = 7; /** * The type of padding. * All padding types pad the input shape with zeros. * CUSTOM padding will add the custom padding values specified below to their respective * dimensions, e.g., customPaddingFront number of zeros will be added to one side of the * input's depth dimension and customPaddingBack number of zeros will be added to the other * side of the input's depth dimension. * VALID padding adds no padding to any dimension. In this case, the last pool along * each dimension will be dropped if the input dimension and the kernel size, and stride do not match. * SAME padding adds enough padding to each dimension such that the output * has the same spatial dimensions as the input. Padding is added evenly to both * sides of each dimension unless the total padding to add is odd, in which case the extra padding * is added to the back/bottom/right side of the respective dimension. 
For example, if the the * total horizontal padding is 3, then there will be 1 padding on the left, and 2 padding on the right. */ enum Pooling3DPaddingType { CUSTOM = 0; VALID = 1; SAME = 2; } Pooling3DPaddingType paddingType = 15; // Padding before the input in the depth direction. int32 customPaddingFront = 8; // Padding after the input in the depth direction. int32 customPaddingBack = 9; // Padding before the input in the height direction. int32 customPaddingTop = 10; // Padding after the input in the height direction. int32 customPaddingBottom = 11; // Padding before the input in the width direction. int32 customPaddingLeft = 12; // Padding after the input in the width direction. int32 customPaddingRight = 13; // If true, exclude zeros from padding in Average pooling. Meaningless in Max Pooling. bool countExcludePadding = 14; } /* * A layer to pool three spatial dimensions down to one value. * This behaves like a special case of Pooling3DLayerParams in which * the Kernel is the size of the input and there is no padding. * * Input * A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. * * Output * Rank is same as the input: A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. * Depth, height, and width of the output will always be 1. * * Requires 1 input and produces 1 output. 
* * For example, given an input of shape (1,1,2,3,3): * +----+----+----+ * / | 10 | 11 | 12 | * / +----+----+----+ * / | 13 | 14 | 15 | * / +----+----+----+ * / | 16 | 17 | 18 | * / +----+----+----+ * +----+----+----+ / * | 1 | 2 | 3 | / * +----+----+----+ / * | 4 | 5 | 6 | / * +----+----+----+ / * | 7 | 8 | 9 | / * +----+----+----+ * * And applying MAX global 3d pooling, we expect to get an output with shape: (1,1,1,1,1) and value: * +----+ * | 18 | * +----+ */ message GlobalPooling3DLayerParams { enum GlobalPoolingType3D { MAX = 0; AVERAGE = 1; } // Whether to use Max or Average GlobalPoolingType3D type = 1; } /** * A layer that performs padding along spatial dimensions. * * .. code:: * * y = PaddingLayer(x) * * Requires 1 input and produces 1 output. * * Input * A blob with rank at least 2. * e.g.: blob with shape [H_in, W_in]. * For ranks greater than 2, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch * i.e. Padding is applied on last two dimensions. * * Output * Same rank as the input. * e.g.: blob with shape [H_out, W_out]. * * Output dimensions are calculated as follows: * * .. code:: * * H_out = H_in + topPaddingAmount + bottomPaddingAmount * W_out = W_in + leftPaddingAmount + rightPaddingAmount * * topPaddingAmount == Height startEdgeSize == borderAmounts[0].startEdgeSize * bottomPaddingAmount == Height endEdgeSize == borderAmounts[0].endEdgeSize * leftPaddingAmount == Width startEdgeSize == borderAmounts[1].startEdgeSize * rightPaddingAmount == Width endEdgeSize == borderAmounts[1].endEdgeSize * * There are three types of padding: * * - PaddingConstant, which fills a constant value at the border. * - PaddingReflection, which reflects the values at the border. * - PaddingReplication, which replicates the values at the border. * * Given the following input: * * .. 
code::
 *
 *      [1, 3, 4] :  1 2 3 4
 *                   5 6 7 8
 *                   9 10 11 12
 *
 * Here is the output of applying the padding
 * (top=2, left=2, bottom=0, right=0)
 * with each of the supported types:
 *
 * - PaddingConstant (value = 0):
 *   .. code::
 *
 *       [1, 5, 6] :  0 0 0 0 0 0
 *                    0 0 0 0 0 0
 *                    0 0 1 2 3 4
 *                    0 0 5 6 7 8
 *                    0 0 9 10 11 12
 *
 * - PaddingReflection:
 *   .. code::
 *
 *       [1, 5, 6] :  11 10 9 10 11 12
 *                    7 6 5 6 7 8
 *                    3 2 1 2 3 4
 *                    7 6 5 6 7 8
 *                    11 10 9 10 11 12
 *
 * - PaddingReplication:
 *   .. code::
 *
 *       [1, 5, 6] :  1 1 1 2 3 4
 *                    1 1 1 2 3 4
 *                    1 1 1 2 3 4
 *                    5 5 5 6 7 8
 *                    9 9 9 10 11 12
 */
message PaddingLayerParams {
    /**
     * Fill a constant value in the padded region.
     */
    message PaddingConstant {
        float value = 1;
    }

    /**
     * Reflect the values at the border for padding.
     */
    message PaddingReflection {
    }

    /**
     * Replicate the values at the border for padding.
     */
    message PaddingReplication {
    }

    oneof PaddingType {
        PaddingConstant constant = 1;
        PaddingReflection reflection = 2;
        PaddingReplication replication = 3;
    }

    BorderAmounts paddingAmounts = 10; /// Amounts to be padded to the input.
}

/**
 * A layer that concatenates along the axis = -3 or -5.
 * For general concatenation along any axis, see ConcatNDLayer.
 *
 * .. code::
 *
 *      y = ConcatLayer(x1,x2,....)
 *
 * Requires more than 1 input and produces 1 output.
 *
 * Input
 *   All input blobs must have same rank.
 *   If "sequenceConcat" = False, rank must be greater than equal to 3. In this case concatenation is along axis = -3
 *   If "sequenceConcat" = True, rank must be greater than equal to 5. In this case concatenation is along axis = -5
 *
 * Output
 *   Same rank as the input.
 */
message ConcatLayerParams {
    /**
     * If true, concatenate along the axis = -5 instead of axis = -3.
     */
    bool sequenceConcat = 100;
}

/**
 * A layer that performs local response normalization (LRN).
 *
 * .. code::
 *
 *      y = LRNLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank greater than equal to 3.
* Example: Rank 4 blob represents [Batch, channels, height, width] * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. * Output * A blob with the same shape as the input. * * This layer is described by the following formula: * * .. math:: * x_i \leftarrow \dfrac{x_i}{\left ( k + \dfrac{\alpha}{\text{localSize}} \sum_j x_j^2 \right )^\beta} * * where the summation is done over a (localSize, 1, 1) neighborhood --- * that is, over a window "across" channels in 1x1 spatial neighborhoods. */ message LRNLayerParams { float alpha = 1; float beta = 2; uint64 localSize = 3; /// Number of channels in the normalization window. float k = 4; /// Defaults to 1 if not set or 0. Must be strictly positive. } /** * Softmax Normalization Layer * * A layer that performs softmax normalization. * Normalization is applied along axis = -3 or N-3 (where N is the rank of the input) * For softmax layer that can operate on any axis, see SoftmaxNDLayer. * * * .. code:: * * y = SoftmaxLayer(x) * * Requires 1 input and produces 1 output. * * Input * Must be a blob with rank >= 3. * Output * A blob with the same shape as the input. * * This layer is described by the following formula: * * .. math:: * x_i \leftarrow \dfrac{e^{x_i}}{\sum_i{e^{x_i}}} */ message SoftmaxLayerParams { } /** * A layer that uniformly splits across axis = -3 to produce a specified number of outputs. * For general split operation along any axis, see SplitNDLayer. * * .. code:: * * (y1,y2,...yN) = SplitLayer(x), where N = nOutputs * * Requires 1 input and produces multiple outputs. * * Input * A blob with rank at least 3. * e.g.: blob with shape [C, H, W] * Output * nOutputs blobs each with same rank as the input. * e.g.: For input that is of shape [C, H, W], output shapes will be [C/nOutputs, H, W] */ message SplitLayerParams { uint64 nOutputs = 1; /// The number of outputs. } /** * A layer that performs elementwise addition. 
 * This layer has limited broadcasting support. For general broadcasting see AddBroadcastableLayer.
 *
 * .. code::
 *
 *     y = AddLayer(x1,x2,...)
 *
 * Requires 1 or more than 1 input and produces 1 output.
 *
 * Input
 *     In general, there are no rank constraints.
 *     However, only certain set of shapes are broadcastable. For example:
 *     [B, 1, 1, 1], [B, C, 1, 1], [B, 1, H, W], [B, C, H, W]
 * Output
 *     A blob with shape equal to the input blob.
 *
 * If only one input is provided, scalar addition is performed:
 *
 * .. math::
 *     y = x + \alpha
 *
 */
message AddLayerParams {

    /**
     * Scalar to be added to the input.
     * Only used if there is a single input.
     */
    float alpha = 1;

}

/**
 * A layer that performs elementwise multiplication.
 * This layer has limited broadcasting support. For general broadcasting see MultiplyBroadcastableLayer.
 *
 * .. code::
 *
 *     y = MultiplyLayer(x1,x2,...)
 *
 * Requires 1 or more than 1 input and produces 1 output.
 *
 * Input
 *     In general, there are no rank constraints.
 *     However, only certain set of shapes are broadcastable. For example:
 *     [B, 1, 1, 1], [B, C, 1, 1], [B, 1, H, W], [B, C, H, W]
 * Output
 *     A blob with shape equal to the first input blob.
 *
 * If only one input is provided, scalar multiplication is performed:
 *
 * .. math::
 *     y = \alpha x
 *
 */
message MultiplyLayerParams {

    /**
     * Scalar to be multiplied with the input.
     * Only used if there is a single input.
     */
    float alpha = 1;

}

/**
 * A layer that applies a unary function.
 *
 * .. code::
 *
 *     y = UnaryFunctionLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with no rank constraints.
 * Output
 *     A blob with the same shape as the input.
 *
 * The input is first modified by shifting and scaling:
 *
 * .. math::
 *     x \leftarrow \text{scale} \cdot x + \text{shift}
 */
message UnaryFunctionLayerParams {

    /**
     * A unary operator.
     *
     * The following functions are supported:
     *
     * SQRT
     *     .. math:: f(x) = \sqrt{x}
     *
     * RSQRT
     *     .. math:: f(x) = \dfrac{1}{\sqrt{x + \epsilon}}
     *
     * INVERSE
     *     .. math:: f(x) = \dfrac{1}{x + \epsilon}
     *
     * POWER
     *     .. math:: f(x) = x^\alpha
     *
     * EXP
     *     .. math:: f(x) = e^x
     *
     * LOG
     *     .. math:: f(x) = \log x
     *
     * ABS
     *     .. math:: f(x) = |x|
     *
     * THRESHOLD
     *     .. math:: f(x) = \text{max}(\alpha, x)
     */
    enum Operation {
        SQRT = 0;
        RSQRT = 1;
        INVERSE = 2;
        POWER = 3;
        EXP = 4;
        LOG = 5;
        ABS = 6;
        THRESHOLD = 7;
    }
    Operation type = 1; /// The type of unary function.

    /**
     * A constant used in POWER and THRESHOLD functions.
     */
    float alpha = 2;

    /**
     * A small constant added to the input of the RSQRT and INVERSE
     * functions, to avoid division by 0.
     * Defaults to 1e-6 if not set or set to 0.
     */
    float epsilon = 3;

    /**
     * Input is shifted by this amount
     * before the unary function is applied.
     * Defaults to 0.0 if not set.
     */
    float shift = 4;

    /**
     * Input is scaled by this amount
     * before the unary function is applied.
     * Defaults to 1.0 if not set or set to 0.
     */
    float scale = 5;

}

/**
 * A layer that scales up spatial dimensions.
 * It supports two modes: nearest neighbour (default) and bilinear.
 *
 * .. code::
 *
 *     y = UpsampleLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank at least 3.
 *     e.g.: blob with shape [C, H, W].
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 *
 * Output
 *     Same rank as the input.
 *     e.g.: blob with shape [C, scalingFactor[0] * H, scalingFactor[1] * W]
 */
message UpsampleLayerParams {

    /**
     * Scaling Factor. Mutually exclusive with fractionalScalingFactor.
     * Must be length 2 in order [H, W].
     * If not set, default value [1, 1] is used.
     */
    repeated uint64 scalingFactor = 1;

    /**
     * Fractional scaling factor. Mutually exclusive with scalingFactor.
     * Must be length 2 in order [H, W].
     * If not set, default value [1.0, 1.0] is used.
     */
    repeated float fractionalScalingFactor = 7;

    /*
     * Overall mode for interpolating new elements when upsampling.
     * NN - Nearest Neighbors - simply pick the nearest true value for interpolated values.
     * BILINEAR - Use bilinear interpolation. See LinearUpsamplingMode for behavior.
     */
    enum InterpolationMode {

        NN = 0; /// Nearest Neighbour
        BILINEAR = 1; /// Bilinear

    }

    InterpolationMode mode = 5;

    /**
     * LinearUpsampleMode specifies the behavior for linear upsampling. Only valid when Interpolation Mode is BILINEAR.
     * If input grid is [0, Xin-1] (corresponding to an input size of Xin), and if the output size is Xout,
     * then the grid points are sampled in the following manner:
     * DEFAULT:
     *   spacing = (Xin-Xin/Xout) / (Xout-1)
     *   grid_point[i] = min(Xin-1, max(0, i * spacing)), for i = 0,1,2,...,Xout-1
     * ALIGN_CORNERS_TRUE:
     *   spacing = (Xin-1) / (Xout-1)
     *   grid_point[i] = min(Xin-1, max(0, i * spacing)), for i = 0,1,2,...,Xout-1
     * ALIGN_CORNERS_FALSE:
     *   spacing = Xin / Xout
     *   grid_point[i] = min(Xin-1, max(0, i * spacing + 0.5 * spacing - 0.5)), for i = 0,1,2,...,Xout-1
     */
    enum LinearUpsampleMode {

        DEFAULT = 0;
        ALIGN_CORNERS_TRUE = 1;
        ALIGN_CORNERS_FALSE = 2;

    }

    LinearUpsampleMode linearUpsampleMode = 6;

}

/**
 * A layer that resizes the input to a pre-specified spatial size using bilinear interpolation.
 *
 * .. code::
 *
 *     y = ResizeBilinearLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank at least 3.
 *     e.g.: blob with shape [C, H_in, W_in].
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 *
 * Output
 *     Same rank as the input.
 *     e.g.: blob with shape [C, H_out, W_out].
 *
 */
message ResizeBilinearLayerParams {

    /**
     * Target Spatial Size.
     * Must be length 2 in order [Height, Width], i.e. [H_out, W_out].
     * If not set, default value [1, 1] is used.
     */
    repeated uint64 targetSize = 1;

    /**
     * Mode used to compute the grid on which the spatial output values are evaluated.
     * Same mode is applied to both the height and width axes.
     */
    SamplingMode mode = 2;

}

/**
 * A layer that extracts cropped spatial patches or RoIs (regions of interest)
 * from the input and resizes them to a pre-specified size using bilinear interpolation.
 * Note that RoI Align layer can be implemented with this layer followed by a pooling layer.
 *
 * .. code::
 *
 *     y = CropResizeLayer(x)
 *
 * Requires 2 inputs and produces 1 output.
 *
 * Input
 *     There are two inputs.
 *     First input represents an image feature map.
 *     Second input represents the bounding box coordinates for N patches or RoIs (region of interest).
 *
 *     First input is rank 5: [1, Batch, C, H_in, W_in].
 *     Second input is rank 5. Its shape can be either [N, 1, 4, 1, 1] or [N, 1, 5, 1, 1].
 *
 *     N: number of patches/RoIs to be extracted
 *
 *     If RoI shape = [N, 1, 4, 1, 1]
 *         The axis=-3 corresponds to the four coordinates specifying the bounding box.
 *         All the N RoIs are extracted from all the batches of the input.
 *
 *     If RoI shape = [N, 1, 5, 1, 1]
 *         The first element of the axis=-3 specifies the input batch id from which to extract the RoI and
 *         must be in the interval [0, Batch - 1]. That is, n-th RoI is extracted from the RoI[n,0,0,0,0]-th
 *         input batch id. The last four elements of the axis=-3 specify the bounding box coordinates.
 *
 * Output
 *     A blob with rank 5.
 *     - Shape is [N, Batch, C, H_out, W_out] if input RoI shape is [N, 1, 4, 1, 1]
 *     - Shape is [N, 1, C, H_out, W_out] if input RoI shape is [N, 1, 5, 1, 1]
 *
 */
message CropResizeLayerParams {

    /**
     * Target Spatial Size.
     * Must be length 2 in order [Height, Width], i.e. [H_out, W_out].
     * If not set, default value [1, 1] is used.
     */
    repeated uint64 targetSize = 1;

    /**
     * If true the bounding box coordinates must be in the interval [0, 1].
     * They are scaled by (H_in - 1), (W_in - 1), i.e. based on the input spatial dimensions.
     * If false the bounding box coordinates must be in the interval
     * [0, H_in - 1] and [0, W_in - 1], respectively for height and width dimensions.
     */
    bool normalizedCoordinates = 2;

    /**
     * Mode used to compute the grid on which the spatial output values are evaluated.
     * Same mode is applied to both the height and width axes.
     */
    SamplingMode mode = 3;

    /**
     * Representation used to express the bounding box coordinates.
     * It determines how the values of the second input are interpreted.
     */
    BoxCoordinatesMode boxIndicesMode = 4;

    /**
     * Additional spatial scale that multiplies the bounding box coordinates.
     * Generally used while implementing the RoI Align layer,
     * which uses unnormalized RoI coordinates along with a spatial scale less than or equal to 1.
     */
    float spatialScale = 5;

}

/**
 * A layer that performs elementwise addition of a bias,
 * which is broadcasted to match the input shape.
 *
 * .. code::
 *
 *     y = BiasLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank at least 3.
 *     e.g.: blob with shape [C, H, W].
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 * Output
 *     A blob with the same shape as the input.
 */
message BiasLayerParams {

    /**
     * The shape of the bias.
     * Must be one of the following:
     * [1], [C], [1, H, W] or [C, H, W].
     */
    repeated uint64 shape = 1;

    /**
     * The bias values.
     * The size must be equal to the product of the shape dimensions.
     */
    WeightParams bias = 2;

}

/**
 * A layer that performs elementwise multiplication by a scale factor
 * and optionally adds a bias;
 * both the scale and bias are broadcasted to match the input shape.
 *
 * .. code::
 *
 *     y = ScaleLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank at least 3.
 *     e.g.: blob with shape [C, H, W].
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 * Output
 *     A blob with the same shape as the input.
 */
message ScaleLayerParams {

    /**
     * The shape of the scale.
     * Must be one of the following:
     * [1], [C], [1, H, W] or [C, H, W].
     */
    repeated uint64 shapeScale = 1;

    /**
     * The scale values.
     * The size must be equal to the product of the dimensions specified in shapeScale.
     */
    WeightParams scale = 2;

    bool hasBias = 3; /// If true, a bias is added after scaling.

    /**
     * The shape of the bias.
     * Must be one of the following:
     * [1], [C], [1, H, W] or [C, H, W].
     */
    repeated uint64 shapeBias = 4;

    /**
     * The bias values.
     * The size must be equal to the product of the shape dimensions.
     */
    WeightParams bias = 5;

}

/**
 * A layer that loads data as a parameter and provides it as an output.
 * The output is rank 5. For general rank, see LoadConstantNDLayer.
 *
 * .. code::
 *
 *     y = LoadConstantLayer()
 *
 * Requires no input and produces 1 output.
 *
 * Output:
 *     A blob with rank 5 and shape [1, 1, C, H, W]
 */
message LoadConstantLayerParams {

    /**
     * The shape of the constant to be loaded,
     * which must be [C, H, W], that is length 3.
     */
    repeated uint64 shape = 1;

    /**
     * The data values,
     * of size C * H * W.
     */
    WeightParams data = 2;

}

/**
 * A layer that performs L2 normalization, i.e. divides by the
 * square root of the sum of squares of all elements of input.
 *
 * .. code::
 *
 *     y = L2NormalizeLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank greater than or equal to 3.
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 * Output
 *     A blob with the same shape as the input.
 *
 * This layer is described by the following formula:
 *
 * .. math::
 *     x_i \leftarrow \dfrac{x_i}{\sqrt{\sum{x_i^2} + \epsilon}}
 */
message L2NormalizeLayerParams {

    /**
     * A small constant to avoid division by 0 while normalizing variance.
     * Defaults to 1e-6 if not set or set to 0.
     */
    float epsilon = 1;

}

/// Data Reorganization Layers
/// --------------------------

/**
 * A layer that flattens the input.
 *
 * .. code::
 *
 *     y = FlattenLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank greater than or equal to 3.
 *     e.g.: Rank 4 blob represents [Batch, C, H, W]
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 * Output
 *     Same rank as the input, such that last two dimensions are both 1.
 *     e.g.: For rank 4 input, output shape is [Batch, C * H * W, 1, 1]
 *
 * There are two flatten orders: CHANNEL_FIRST and CHANNEL_LAST.
 * CHANNEL_FIRST does not require data to be rearranged,
 * because row major ordering is used by internal storage.
 * CHANNEL_LAST requires data to be rearranged.
 */
message FlattenLayerParams {

    enum FlattenOrder {

        CHANNEL_FIRST = 0;
        CHANNEL_LAST = 1;

    }
    FlattenOrder mode = 1;

}

/**
 * A layer that recasts the input into a new shape.
 *
 * .. code::
 *
 *     y = ReshapeLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank 5.
 *     e.g.: [1, 1, C, H, W] or [Seq, 1, C, H, W].
 * Output
 *     A blob with rank 5.
 *     e.g.: [1, 1, C_out, H_out, W_out] or [Seq_out, 1, C_out, H_out, W_out].
 *
 * There are two reshape orders: CHANNEL_FIRST and CHANNEL_LAST.
 * CHANNEL_FIRST is equivalent to
 * flattening the input to [Seq, 1, C * H * W, 1, 1] in channel first order
 * and then reshaping it to the target shape;
 * no data rearrangement is required.
 * CHANNEL_LAST is equivalent to
 * flattening the input to [Seq, 1, H * W * C, 1, 1] in channel last order,
 * reshaping it to [Seq_out, 1, H_out, W_out, C_out] (it is now in "H_out-major" order),
 * and then permuting it to [C_out, H_out, W_out];
 * both the flattening and permuting requires the data to be rearranged.
 */
message ReshapeLayerParams {

    /**
     * The shape of the output.
     * Must be of length 3 or 4.
     * If set to 3, targetShape is interpreted as
     * [1, 1, C_out, H_out, W_out], and sequence length of the input is preserved.
     * If set to 4, targetShape is interpreted as
     * [Seq_out, 1, C_out, H_out, W_out],
     * where Seq_out is the new sequence length.
     */
    repeated int64 targetShape = 1;

    enum ReshapeOrder {

        CHANNEL_FIRST = 0;
        CHANNEL_LAST = 1;

    }
    ReshapeOrder mode = 2;

}

/**
 * A layer that rearranges the dimensions and data of an input.
 * For generic transpose/permute operation see TransposeLayer.
 *
 * .. code::
 *
 *     y = PermuteLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     Must be a rank 5 blob.
 *     e.g.: shape [Seq, B, C, H, W].
 * Output
 *     Rank 5 blob. Transposed version of the input, such that dimensions at axis=1 or axis=-4 is unchanged.
 *
 *
 * Examples:
 *
 *  Assume input shape is [Seq, B, C, H, W]
 *
 * - If axis is set to [0, 3, 1, 2],
 *   then the output has shape [Seq, B, W, C, H]
 *
 * - If axis is set to [3, 1, 2, 0],
 *   then the output has shape [W, B, C, H, Seq]
 *
 * - If axis is set to [0, 3, 2, 1],
 *   then the output has shape [Seq, B, W, H, C]
 *
 * - If axis is not set, or is set to [0, 1, 2, 3],
 *   the output is the same as the input.
 */
message PermuteLayerParams {

    /**
     * The order in which to permute the dimensions.
     * Must have length 4 and a permutation of [0, 1, 2, 3].
     */
    repeated uint64 axis = 1;

}

/**
 * A layer that reorganizes data in the input in specific ways.
 *
 * .. code::
 *
 *     y = ReorganizeDataLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank at least 3.
 *     e.g.: blob with shape [C, H, W].
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 * Output
 *     Same rank as the input.
 *     e.g.: blob with shape [C_out, H_out, W_out].
 *
 * mode == SPACE_TO_DEPTH
 *     [C_out, H_out, W_out] : [C * blockSize * blockSize, H/blockSize, W/blockSize].
 *     blockSize must divide H and W.
 *     Data is moved from the spatial dimensions to the channel dimension. Input is spatially divided into
 *     non-overlapping blocks of size blockSize X blockSize and data from each block is moved into the
 *     channel dimension.
 *
 * mode == DEPTH_TO_SPACE
 *     [C_out, H_out, W_out] : [C/(blockSize * blockSize), H * blockSize, W * blockSize].
 *     Square of blockSize must divide C.
 *     Reverse of SPACE_TO_DEPTH. Data is moved from the channel dimension to the spatial dimensions.
 *
 * mode == PIXEL_SHUFFLE
 *     [C_out, H_out, W_out] : [C/(blockSize * blockSize), H * blockSize, W * blockSize].
 *     Square of blockSize must divide C.
 *     Similar to DEPTH_TO_SPACE, but using the pixel-shuffle semantics for channel order in the output space.
 *     In both modes, elements along the channel dimension are collapsed into
 *     blocks in the spatial dimensions. The difference is in the arrangement of
 *     the input-channels' data in the output space. See below example for more
 *     detail.
 *     (Only available in Core ML Specification >= 5 (iOS >= 14, macOS >= 11.0))
 *
 *
 * Examples:
 *
 * Assume input is the following [C = 8, H = 1, W = 2] tensor:
 *
 * .. code::
 *
 *    [[[1 2]] [[3 4]] [[5 6]] [[7 8]] [[9 10]] [[11 12]] [[13 14]] [[15 16]]]
 *
 * If block_size == 2 and mode == DEPTH_TO_SPACE, output will be the following
 * [C = 2, H = 2, W = 4] tensor:
 *
 * .. code::
 *
 *    [[[ 1  5  2  6]
 *      [ 9 13 10 14]]
 *
 *     [[ 3  7  4  8]
 *      [11 15 12 16]]]
 *
 * For mode == SPACE_TO_DEPTH, the behavior is the same as mode ==
 * DEPTH_TO_SPACE, but with the input and output swapped.
 *
 * If block_size == 2 and mode == PIXEL_SHUFFLE, output will be the following
 * [C = 2, H = 2, W = 4] tensor:
 *
 * .. code::
 *
 *    [[[ 1  3  2  4]
 *      [ 5  7  6  8]]
 *
 *     [[ 9 11 10 12]
 *      [13 15 14 16]]]
 *
 */
message ReorganizeDataLayerParams {

    enum ReorganizationType {

        SPACE_TO_DEPTH = 0;
        DEPTH_TO_SPACE = 1;
        PIXEL_SHUFFLE = 2;

    }
    ReorganizationType mode = 1;
    uint64 blockSize = 2; /// must be greater than 1

}

/**
 * A layer that slices the input data along axis = -1 or -2 or -3.
 * For general slice along any axis, please see SliceStaticLayer/SliceDynamicLayer.
 *
 * .. code::
 *
 *     y = SliceLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob that can, in general, have any rank. However, depending on the value of "axis",
 *     there may be additional rank constraints.
 * Output
 *     A blob with the same rank as the input.
 *
 * Sliced section is taken from the interval [startIndex, endIndex), i.e.
 * startIndex is inclusive while endIndex is exclusive.
 * stride must be positive and represents the step size for slicing.
 * Negative indexing is supported for startIndex and endIndex.
 * -1 denotes N-1, -2 denotes N-2 and so on, where N is the length of the dimension to be sliced.
 *
 */
message SliceLayerParams {

    int64 startIndex = 1; /// start of the sliced section. Inclusive.
    int64 endIndex = 2; /// end of sliced section. Exclusive.
    uint64 stride = 3; /// The step size. Must be positive.

    enum SliceAxis {

        CHANNEL_AXIS = 0;
        HEIGHT_AXIS = 1;
        WIDTH_AXIS = 2;

    }
    // The following mapping is used for interpreting this parameter:
    // CHANNEL_AXIS => axis = -3, input must have rank at least 3.
    // HEIGHT_AXIS => axis = -2, input must have rank at least 2.
    // WIDTH_AXIS => axis = -1
    SliceAxis axis = 4;

}

/**
 * A layer that reduces the input using a specified operation.
 *
 * .. code::
 *
 *     y = ReduceLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob that can, in general, have any rank. However, depending on the value of "axis",
 *     there may be additional rank constraints.
 * Output
 *     A blob with the same rank as the input, which has 1s on the dimensions specified in the parameter "axis"
 *
 * Values supported for axis are [-1], [-2], [-3], [-2,-1], [-3,-2,-1]
 * and the equivalent positive values (depending on the rank of the input)
 * For mode == 'ArgMax', axis must be [-1] or [-2] or [-3].
 */
message ReduceLayerParams {

    /*
     * The following reduction operations are supported
     * and are applied on the specified axis of the input array:
     *
     * SUM
     *     Sum of all elements
     *
     *     .. math:: \sum{x_i}
     *
     * AVG
     *     Sum of all elements divided by the number of elements
     *
     *     .. math:: \dfrac{\sum^n{x_i}}{n}
     *
     * PROD
     *     Product of all elements
     *
     *     .. math:: \prod{x_i}
     *
     * LOGSUM
     *     Sum of the natural logarithm of all elements
     *
     *     .. math:: \sum{\ln{(x_i + \epsilon)}}
     *
     * SUMSQUARE
     *     Sum of squares of all elements
     *
     *     .. math:: \sum{x_i^2}
     *
     * L1
     *     L1 normalization of all elements
     *
     *     .. math:: ||x||_1 = \sum{|x_i|}
     *
     * L2
     *     L2 normalization of all elements
     *
     *     .. math:: ||x||_2 = \sqrt{\sum{x_i^2}}
     *
     * MAX
     *     Maximum of all elements
     *
     *     .. math:: \text{max}(x_i)
     *
     * MIN
     *     Minimum of all elements
     *
     *     .. math:: \text{min}(x_i)
     *
     * ARGMAX
     *     Argument of the maximum of all elements
     *
     *     .. math:: \text{argmax}(x_i)
     *
     */
    enum ReduceOperation {

        SUM = 0;
        AVG = 1;
        PROD = 2;
        LOGSUM = 3;
        SUMSQUARE = 4;
        L1 = 5;
        L2 = 6;
        MAX = 7;
        MIN = 8;
        ARGMAX = 9; /// only supported with axis = C, H or W.

    }
    ReduceOperation mode = 1; /// Specifies function used to reduce.

    /**
     * Used if mode is LOGSUM.
     * Defaults to 1e-6 if not set or is set to 0.
     */
    float epsilon = 2;

    enum ReduceAxis {

        CHW = 0;
        HW = 1;
        C = 2;
        H = 3;
        W = 4;

    }
    // The following mapping is used for interpreting this parameter:
    // CHW = axis [-3, -2, -1], input must have rank at least 3.
    // HW = axis [-2, -1], input must have rank at least 2.
    // C = axis [-3]
    // H = axis [-2]
    // W = axis [-1]
    ReduceAxis axis = 3;

}

/**
 * A layer that crops the spatial dimensions of an input.
 * If two inputs are provided, the shape of the second input is used as the reference shape.
 *
 * .. code::
 *
 *     y = CropLayer(x1) or y = CropLayer(x1,x2)
 *
 * Requires 1 or 2 inputs and produces 1 output.
 *
 * Input
 *     1 or 2 tensors, each with rank at least 3, both inputs must have equal rank.
 *     Example:
 *     - 1 input case: A blob with shape [C, H_in, W_in].
 *     - 2 input case: 1st blob with shape [C, H_in, W_in], 2nd blob with shape [C, H_out, W_out].
 *
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 *
 * Output
 *     Same rank as the inputs.
 *     e.g.: A blob with shape [C, H_out, W_out].
 *
 * If one input is used, output is computed as follows:
 *
 * .. code::
 *
 *     y = x1[:, topCropAmount:H_in - bottomCropAmount, leftCropAmount:W_in - rightCropAmount]
 *
 *     topCropAmount == Height startEdgeSize == borderAmounts[0].startEdgeSize
 *     bottomCropAmount == Height endEdgeSize == borderAmounts[0].endEdgeSize
 *     leftCropAmount == Width startEdgeSize == borderAmounts[1].startEdgeSize
 *     rightCropAmount == Width endEdgeSize == borderAmounts[1].endEdgeSize
 *
 *     H_out = H_in - topCropAmount - bottomCropAmount
 *     W_out = W_in - leftCropAmount - rightCropAmount
 *
 * If two inputs are used, output is computed as follows:
 *
 * .. code::
 *
 *     y = x1[:, offset[0]:offset[0] + H_out, offset[1]:offset[1] + W_out]
 */
message CropLayerParams {

    /**
     * The amounts to be cropped from the input.
     * Used only if a single input is provided.
     */
    BorderAmounts cropAmounts = 1;

    /**
     * The offset amounts.
     * Used only if two inputs are provided.
     * Must be of length 2, in order [H, W].
     */
    repeated uint64 offset = 5;

}

/**
 * A layer that computes the elementwise average of the inputs.
 * This layer has limited broadcasting support. For general broadcasting see AddBroadcastableLayer.
 *
 * .. code::
 *
 *     y = AverageLayer(x1,x2,...)
 *
 * Requires multiple inputs and produces 1 output.
 *
 * Input
 *     In general, there are no rank constraints.
 *     However, only certain set of shapes are broadcastable. For example:
 *     [B, 1, 1, 1], [B, C, 1, 1], [B, 1, H, W], [B, C, H, W]
 * Output
 *     A blob with the same shape as each input.
 */
message AverageLayerParams {

}

/**
 * A layer that computes the elementwise maximum over the inputs.
 *
 * .. code::
 *
 *     y = MaxLayer(x1,x2,...)
 *
 * Requires multiple inputs and produces 1 output.
 *
 * Input
 *     In general, there are no rank constraints.
 *     However, only certain set of shapes are broadcastable. For example:
 *     [B, C, 1, 1], [B, C, H, W]
 * Output
 *     A blob with the same shape as each input.
 */
message MaxLayerParams {

}

/**
 * A layer that computes the elementwise minimum over the inputs.
 *
 * .. code::
 *
 *     y = MinLayer(x1,x2,...)
 *
 * Requires multiple inputs and produces 1 output.
 *
 * Input
 *     In general, there are no rank constraints.
 *     However, only certain set of shapes are broadcastable. For example:
 *     [B, C, 1, 1], [B, C, H, W]
 * Output
 *     A blob with the same shape as each input.
 */
message MinLayerParams {

}

/**
 * A layer that computes the dot product of two vectors.
 *
 * .. code::
 *
 *     y = DotProductLayer(x1,x2)
 *
 * Requires 2 inputs and produces 1 output.
 *
 * Input
 *     Two blobs with rank at least 3, such that the last two dimensions must be 1.
 *     e.g.: blobs with shape [B, C, 1, 1].
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 *
 * Output
 *     Same rank as the input.
 *     e.g. for rank 4 inputs, output shape: [B, 1, 1, 1]
 */
message DotProductLayerParams {

    /**
     * If true, inputs are normalized first,
     * thereby computing the cosine similarity.
     */
    bool cosineSimilarity = 1;

}

/**
 * A layer that performs mean variance normalization, along axis = -3.
 *
 * .. code::
 *
 *     y = MeanVarianceNormalizeLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank greater than or equal to 3.
 *     Example: Rank 4 blob represents [Batch, channels, height, width]
 *     For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch.
 *
 * Output
 *     A blob with the same shape as the input.
 *
 * If acrossChannels == true
 *     normalization is performed on flattened input, i.e. the input is reshaped to (Batch,C), where "Batch" contains
 *     all dimensions from 0 to -4 (inclusive), and C contains dimensions -1, -2, -3.
 *
 * If acrossChannels == false
 *     normalization is performed within a channel,
 *     across spatial dimensions (i.e. last two dimensions).
 */
message MeanVarianceNormalizeLayerParams {

    /**
     * If true, mean and variance are computed across channels.
     */
    bool acrossChannels = 1;

    /**
     * If false, only mean is subtracted.
     */
    bool normalizeVariance = 2;

    /**
     * A small constant to avoid division by 0 while normalizing variance.
     * Defaults to 1e-6 if not set or set to 0.
     */
    float epsilon = 3;

}

/**
 * A layer that repeats a sequence or the dimension sitting at axis = -5
 *
 * .. code::
 *
 *     y = SequenceRepeatLayer(x)
 *
 * Requires 1 input and produces 1 output.
 *
 * Input
 *     A blob with rank at least 5.
 *     e.g: shape [Seq, B, C, H, W]
 * Output
 *     A blob with the same rank as the input.
 *     e.g.: for input shape [Seq, B, C, H, W], output shape is [nRepetitions * Seq, B, C, H, W].
 */
message SequenceRepeatLayerParams {

    /**
     * Number of repetitions.
     * Defaults to 1 if not set or set to 0.
     */
    uint64 nRepetitions = 1;

}

/// Recurrent Layers
/// ----------------

/*
 * The following activations are supported with recurrent layers:
 * - Linear
 * - Sigmoid
 * - Tanh
 * - ReLU
 * - Scaled Hyperbolic Tangent: alpha * tanh(beta * x), currently only supported for alpha = 1.7159, beta = 2/3
 * - Hard Sigmoid: min(max(alpha * x + beta, 0), 1), currently only supported for alpha = 0.2, beta = 0.5
 */

/**
 * A simple recurrent layer.
 *
 * .. code::
 *
 *     y_t = SimpleRecurrentLayer(x_t, y_{t-1})
 *
 * Input
 *    A blob of rank 5, with shape [Seq, Batch, inputVectorSize, 1, 1].
 *    This represents a sequence of vectors of size inputVectorSize.
 * Output
 *    Same rank as the input.
 *    Represents a vector of size outputVectorSize. It is either the final output or a sequence of outputs at all time steps.
 *
 * - Output Shape: [1, Batch, outputVectorSize, 1, 1] , if sequenceOutput == false
 * - Output Shape: [Seq, Batch, outputVectorSize, 1, 1] , if sequenceOutput == true
 *
 * This layer is described by the following equation:
 *
 * .. math::
 *     \boldsymbol{y_t} = f(\mathrm{clip}(W \boldsymbol{x_t} + \
 *                                        R \boldsymbol{y_{t-1}} + b))
 *
 * - ``W`` is a 2-dimensional weight matrix
 *   ([outputVectorSize, inputVectorSize], row-major)
 * - ``R`` is a 2-dimensional recursion matrix
 *   ([outputVectorSize, outputVectorSize], row-major)
 * - ``b`` is a 1-dimensional bias vector ([outputVectorSize])
 * - ``f()`` is an activation
 * - ``clip()`` is a function that constrains values between ``[-50.0, 50.0]``
 */
message SimpleRecurrentLayerParams {

    uint64 inputVectorSize = 1; /// The size of the input vectors.
    uint64 outputVectorSize = 2; /// The size of the output vectors.

    /**
     * Activations supported are Linear, Sigmoid, Tanh, ReLU, Scaled Tanh (alpha = 1.71, beta = 2/3), Hard sigmoid (alpha = 0.2, beta = 0.5)
     */
    ActivationParams activation = 10; /// The activation function.

    /**
     * If false output is just the result after final state update.
     * If true, output is a sequence, containing outputs at all time steps.
     */
    bool sequenceOutput = 15;

    bool hasBiasVector = 20; /// If false, no bias is added.

    WeightParams weightMatrix = 30; /// Weight matrix W.
    WeightParams recursionMatrix = 31; /// Recursion Weight matrix R.
    WeightParams biasVector = 32; /// Bias vector b.

    bool reverseInput = 100; // If true, then the node processes the input sequence from right to left

}

/**
 * Gated-Recurrent Unit (GRU) Layer
 *
 * .. code::
 *
 *     y_t = GRULayer(x_t, y_{t-1})
 *
 * Input
 *    A blob of rank 5, with shape [Seq, Batch, inputVectorSize, 1, 1].
 *    This represents a sequence of vectors of size inputVectorSize.
 * Output
 *    Same rank as the input.
 *    Represents a vector of size outputVectorSize. It is either the final output or a sequence of outputs at all time steps.
 *
 * - Output Shape: [1, Batch, outputVectorSize, 1, 1] , if sequenceOutput == false
 * - Output Shape: [Seq, Batch, outputVectorSize, 1, 1] , if sequenceOutput == true
 *
 * This layer is described by the following equations:
 *
 * Update Gate
 *     .. math::
 *         \boldsymbol{z_t} = \
 *             f(\mathrm{clip}(W_z \boldsymbol{x_t} + \
 *                             R_z \boldsymbol{y_{t-1}} + b_z))
 *
 * Reset Gate
 *     .. math::
 *         \boldsymbol{r_t} = \
 *             f(\mathrm{clip}(W_r \boldsymbol{x_t} + \
 *                             R_r \boldsymbol{y_{t-1}} + b_r))
 *
 * Cell Memory State
 *     .. math::
 *         \boldsymbol{c_t} = \
 *             \boldsymbol{y_{t-1}} \odot \boldsymbol{r_t}
 *
 * Output Gate
 *     .. math::
 *         \boldsymbol{o_t} = \
 *             g(\mathrm{clip}(W_o \boldsymbol{x_t} + \
 *                             R_o \boldsymbol{c_t} + b_o))
 *
 * Output
 *     .. math::
 *         \boldsymbol{y_t} = \
 *             (1 - \boldsymbol{z_t}) \odot \boldsymbol{o_t} + \
 *              \boldsymbol{z_t} \odot \boldsymbol{y_{t-1}}
 *
 * - ``W_z``, ``W_r``, ``W_o`` are 2-dimensional input weight matrices
 *   ([outputVectorSize, inputVectorSize], row-major)
 * - ``R_z``, ``R_r``, ``R_o`` are 2-dimensional recursion matrices
 *   ([outputVectorSize, outputVectorSize], row-major)
 * - ``b_z``, ``b_r``, ``b_o`` are 1-dimensional bias vectors
 *   ([outputVectorSize])
 * - ``f()``, ``g()`` are activations
 * - ``clip()`` is a function that constrains values between ``[-50.0, 50.0]``
 * - ``⊙`` denotes the elementwise product of matrices
 */
message GRULayerParams {

    uint64 inputVectorSize = 1; /// Size of the input vectors.
    uint64 outputVectorSize = 2; /// Size of the output vectors.

    /**
     * 2 element array representing activations [f(), g()] in that order.
     * Typical values used = [sigmoid, tanh].
     * Activations supported are Linear, Sigmoid, Tanh, ReLU, Scaled Tanh (alpha = 1.71, beta = 2/3), Hard sigmoid (alpha = 0.2, beta = 0.5)
     */
    repeated ActivationParams activations = 10;

    /**
     * If false output is just the result after final state update.
     * If true, output is a sequence, containing outputs at all time steps.
     */
    bool sequenceOutput = 15;

    /**
     * If false, no biases (b_z, b_r, b_o) are added.
     */
    bool hasBiasVectors = 20;

    WeightParams updateGateWeightMatrix = 30; /// Weight Matrix W_z.
    WeightParams resetGateWeightMatrix = 31; /// Weight Matrix W_r.
    WeightParams outputGateWeightMatrix = 32; /// Weight Matrix W_o.

    WeightParams updateGateRecursionMatrix = 50; /// Recursion Weight Matrix R_z.
    WeightParams resetGateRecursionMatrix = 51; /// Recursion Weight Matrix R_r.
    WeightParams outputGateRecursionMatrix = 52; /// Recursion Weight Matrix R_o.

    WeightParams updateGateBiasVector = 70; /// Bias vector b_z.
    WeightParams resetGateBiasVector = 71; /// Bias vector b_r.
    WeightParams outputGateBiasVector = 72; /// Bias vector b_o.

    /// If true, then the node processes the input sequence from right to left
    bool reverseInput = 100;

}

/**
 * Long short-term memory (LSTM) parameters.
 *
 * This is described by the following equations:
 *
 * Input Gate
 *     .. math::
 *         \boldsymbol{i_t} = \
 *             f(\mathrm{clip}(W_i \boldsymbol{x_t} + \
 *                             R_i \boldsymbol{y_{t-1}} + \
 *                             p_i \odot c_{t-1} + b_i))
 *
 * Forget Gate
 *     .. math::
 *         \boldsymbol{f_t} = \
 *             f(\mathrm{clip}(W_f \boldsymbol{x_t} + \
 *                             R_f \boldsymbol{y_{t-1}} + \
 *                             p_f \odot c_{t-1} + b_f))
 *
 * Block Input
 *     .. math::
 *         \boldsymbol{z_t} = \
 *             g(\mathrm{clip}(W_z \boldsymbol{x_t} + \
 *                             R_z \boldsymbol{y_{t-1}} + b_z))
 *
 * Cell Memory State
 *     .. math::
 *         \boldsymbol{c_t} = \
 *             \boldsymbol{c_{t-1}} \odot \boldsymbol{f_t} + \
 *             \boldsymbol{i_t} \odot \boldsymbol{z_t}
 *
 * Output Gate
 *     .. math::
 *         \boldsymbol{o_t} = \
 *             f(\mathrm{clip}(W_o \boldsymbol{x_t} + \
 *                             R_o \boldsymbol{y_{t-1}} + \
 *                             p_o \odot c_t + b_o))
 *
 * Output
 *     .. math::
 *         \boldsymbol{y_t} = \
 *             h(\boldsymbol{c_t}) \odot \boldsymbol{o_t}
 *
 * - ``W_i``, ``W_f``, ``W_z``, ``W_o`` are 2-dimensional input weight matrices
 *   ([outputVectorSize, inputVectorSize], row-major)
 * - ``R_i``, ``R_f``, ``R_z``, ``R_o`` are 2-dimensional recursion matrices
 *   ([outputVectorSize, outputVectorSize], row-major)
 * - ``b_i``, ``b_f``, ``b_z``, ``b_o`` are 1-dimensional bias vectors
 *   ([outputVectorSize])
 * - ``p_i``, ``p_f``, ``p_o`` are 1-dimensional peephole vectors
 *   ([outputVectorSize])
 * - ``f()``, ``g()``, ``h()`` are activations
 * - ``clip()`` is a function that constrains values between ``[-50.0, 50.0]``
 * - ``⊙`` denotes the elementwise product of matrices
 */
message LSTMParams {

    /**
     * If true, output is a sequence, containing outputs at all time steps.
     * If false, output is just the result after final state update.
     */
    bool sequenceOutput = 10;

    /**
     * If false, no biases (b_i, b_f, b_z, b_o) are added.
     */
    bool hasBiasVectors = 20;

    /**
     * If true, a vector of 1 values is added to b_f.
     */
    bool forgetBias = 30;

    /**
     * If true, peephole vectors are included.
     */
    bool hasPeepholeVectors = 40;

    /**
     * If the coupled Input and Forget flag is on, the behaviour of
     * c_t is changed to the following (i.e. forget gate is not used):
     *
     * .. math::
     *     \boldsymbol{c_t} = \
     *         \boldsymbol{c_{t-1}} \odot (1 - \boldsymbol{i_t}) + \
     *         \boldsymbol{i_t} \odot \boldsymbol{z_t}
     *
     */
    bool coupledInputAndForgetGate = 50;

    /**
     * Places a limit on the maximum and minimum values of c_t.
     * c_t = min(c_t, cellClipThreshold)
     * c_t = max(c_t, -cellClipThreshold)
     * If 0, it is set to its default value = 50.0.
     */
    float cellClipThreshold = 60;

}

/**
 * Weights for long short-term memory (LSTM) layers
 */
message LSTMWeightParams {

    WeightParams inputGateWeightMatrix = 1; /// Weight Matrix W_i.
    WeightParams forgetGateWeightMatrix = 2; /// Weight Matrix W_f.
    WeightParams blockInputWeightMatrix = 3; /// Weight Matrix W_z.
    WeightParams outputGateWeightMatrix = 4; /// Weight Matrix W_o.

    WeightParams inputGateRecursionMatrix = 20; /// Recursion Weight Matrix R_i.
    WeightParams forgetGateRecursionMatrix = 21; /// Recursion Weight Matrix R_f.
    WeightParams blockInputRecursionMatrix = 22; /// Recursion Weight Matrix R_z.
    WeightParams outputGateRecursionMatrix = 23; /// Recursion Weight Matrix R_o.

    //biases:
    WeightParams inputGateBiasVector = 40; /// Bias vector b_i.
    WeightParams forgetGateBiasVector = 41; /// Bias vector b_f.
    WeightParams blockInputBiasVector = 42; /// Bias vector b_z.
    WeightParams outputGateBiasVector = 43; /// Bias vector b_o.

    //peepholes:
    WeightParams inputGatePeepholeVector = 60; /// Peephole vector p_i.
    WeightParams forgetGatePeepholeVector = 61; /// Peephole vector p_f.
    WeightParams outputGatePeepholeVector = 62; /// Peephole vector p_o.

}

/**
 * A unidirectional long short-term memory (LSTM) layer.
 *
 * .. code::
 *
 *     (y_t, c_t) = UniDirectionalLSTMLayer(x_t, y_{t-1}, c_{t-1})
 *
 * Input
 *    A blob of rank 5, with shape [Seq, Batch, inputVectorSize, 1, 1].
 *    This represents a sequence of vectors of size inputVectorSize.
 * Output
 *    Same rank as the input.
 *    Represents a vector of size outputVectorSize. It is either the final output or a sequence of outputs at all time steps.
 *
 * - Output Shape: [1, Batch, outputVectorSize, 1, 1] , if sequenceOutput == false
 * - Output Shape: [Seq, Batch, outputVectorSize, 1, 1] , if sequenceOutput == true
 *
 */
message UniDirectionalLSTMLayerParams {

    uint64 inputVectorSize = 1; /// Size of the input vectors.
    uint64 outputVectorSize = 2; /// Size of the output vectors.

    /**
     * 3 element array representing activations [f(),g(),h()] in that order.
     * Typical values used = [sigmoid, tanh, tanh].
     * Activations supported are Linear, Sigmoid, Tanh, ReLU, Scaled Tanh (alpha = 1.71, beta = 2/3), Hard sigmoid (alpha = 0.2, beta = 0.5)
     */
    repeated ActivationParams activations = 10;

    LSTMParams params = 15;

    LSTMWeightParams weightParams = 20; /// Weights, biases and peepholes.

    /// If true, then the node processes the input sequence from right to left
    bool reverseInput = 100;

}

/**
 * Bidirectional long short-term memory (LSTM) layer
 *
 * .. code::
 *
 *     (y_t, c_t, y_t_reverse, c_t_reverse) = BiDirectionalLSTMLayer(x_t, y_{t-1}, c_{t-1}, y_{t-1}_reverse, c_{t-1}_reverse)
 *
 * Input
 *    A blob of rank 5, with shape [Seq, Batch, inputVectorSize, 1, 1].
 *    This represents a sequence of vectors of size inputVectorSize.
 * Output
 *    Same rank as the input.
 *    Represents a vector of size 2 * outputVectorSize. It is either the final output or a sequence of outputs at all time steps.
 *
 * - Output Shape: [1, Batch, 2 * outputVectorSize, 1, 1] , if sequenceOutput == false
 * - Output Shape: [Seq, Batch, 2 * outputVectorSize, 1, 1] , if sequenceOutput == true
 *
 *
 * The first LSTM operates on the input sequence in the forward direction.
 * The second LSTM operates on the input sequence in the reverse direction.
 *
 * Example: given the input sequence [x_1, x_2, x_3],
 * where x_i are vectors at time index i:
 *
 * The forward LSTM output is [yf_1, yf_2, yf_3],
 *
 * where yf_i are vectors of size outputVectorSize:
 *
 * - yf_1 is the output at the end of sequence {x_1}
 * - yf_2 is the output at the end of sequence {x_1, x_2}
 * - yf_3 is the output at the end of sequence {x_1, x_2, x_3}
 *
 * The backward LSTM output: [yb_1, yb_2, yb_3],
 *
 * where yb_i are vectors of size outputVectorSize:
 *
 * - yb_1 is the output at the end of sequence {x_3}
 * - yb_2 is the output at the end of sequence {x_3, x_2}
 * - yb_3 is the output at the end of sequence {x_3, x_2, x_1}
 *
 * Output of the bi-dir layer:
 *
 * - if sequenceOutput = True : { [yf_1, yb_3],  [yf_2, yb_2],  [yf_3, yb_1] }
 * - if sequenceOutput = False : { [yf_3, yb_3] }
 */
message BiDirectionalLSTMLayerParams {

    /**
     * Size of the input vectors.
     */
    uint64 inputVectorSize = 1;
    /**
     * Size of the outputs vectors.
     * It is same for both forward and backward LSTMs.